コード例 #1
0
ファイル: bt_curnext.c プロジェクト: 7segments/mongo-1
/*
 * __wt_btcur_iterate_setup --
 *	Prepare a btree cursor for next/prev iteration, typically after a
 *	search has positioned it on a page.
 *
 *	cbt: the btree cursor to set up.
 *	next: direction hint; currently unused.
 */
void
__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
{
	WT_PAGE *cur_page;

	WT_UNUSED(next);

	/*
	 * Nothing differs today between setting up for next and for prev, so
	 * both iteration flags are set together; the two flags are retained in
	 * case the directions ever need distinct setup.
	 */
	F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);

	/* Start the page's deleted-item count from zero. */
	cbt->page_deleted_count = 0;

	/*
	 * Without a page reference the cursor isn't positioned by a search:
	 * iteration begins at the start or end of the tree and there is
	 * nothing more to set up.
	 */
	if (cbt->ref == NULL)
		return;
	cur_page = cbt->ref->page;

	if (cur_page->type != WT_PAGE_ROW_LEAF) {
		/* Column-store: record the largest record number on the page. */
		if (cur_page->type == WT_PAGE_COL_VAR)
			cbt->last_standard_recno =
			    __col_var_last_recno(cur_page);
		else
			cbt->last_standard_recno =
			    __col_fix_last_recno(cur_page);

		/* Note when the cursor is walking the page's append list. */
		if (cbt->ins_head != NULL &&
		    cbt->ins_head == WT_COL_APPEND(cur_page))
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
		return;
	}

	/*
	 * Row-store: a single value identifies which part of the page is being
	 * walked, so the WT_ROW array and the WT_INSERT_HEAD array share one
	 * name space. Slot 1 is the "smallest key" insert list, slot 2 is
	 * WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on: insert lists are
	 * the odd slots, WT_ROW entries are the even slots.
	 */
	if (cbt->ins_head == NULL)
		cbt->row_iteration_slot = (cbt->slot + 1) * 2;
	else if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(cur_page))
		cbt->row_iteration_slot = 1;
	else
		cbt->row_iteration_slot = (cbt->slot + 1) * 2 + 1;
}
コード例 #2
0
ファイル: bt_curnext.c プロジェクト: rain10154/wiredtiger
/*
 * __wt_btcur_iterate_setup --
 *	Initialize a cursor for iteration, usually following a search.
 *
 *	cbt: the btree cursor being initialized.
 *	next: iteration direction; not used at present.
 */
void __wt_btcur_iterate_setup(WT_CURSOR_BTREE* cbt, int next)
{
	WT_PAGE *pg;

	WT_UNUSED(next);

	/*
	 * Switching between next and prev needs no special handling yet, so
	 * both flags are set; they are kept separate for future use.
	 */
	F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);

	/* Reset the per-page count of deleted items seen while iterating. */
	cbt->page_deleted_count = 0;

	/* No page reference: nothing else to initialize. */
	if (cbt->ref == NULL)
		return;
	pg = cbt->ref->page;

	/* Row-store leaf page: work out the combined iteration slot. */
	if (pg->type == WT_PAGE_ROW_LEAF) {
		/*
		 * WT_ROW slots and WT_INSERT_HEAD slots are mapped into one
		 * name space: slot 1 is the "smallest key insert list", slot 2
		 * is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on. Insert
		 * lists land on odd slots, WT_ROW entries on even slots.
		 */
		if (cbt->ins_head != NULL &&
		    cbt->ins_head == WT_ROW_INSERT_SMALLEST(pg)) {
			cbt->row_iteration_slot = 1;
			return;
		}

		cbt->row_iteration_slot = (cbt->slot + 1) * 2;
		if (cbt->ins_head != NULL)
			++cbt->row_iteration_slot;
		return;
	}

	/* Column-store page: compute the largest record number on the page. */
	if (pg->type == WT_PAGE_COL_VAR)
		cbt->last_standard_recno = __col_var_last_recno(pg);
	else
		cbt->last_standard_recno = __col_fix_last_recno(pg);

	/* Flag the cursor when it is positioned on the append list. */
	if (cbt->ins_head != NULL && cbt->ins_head == WT_COL_APPEND(pg))
		F_SET(cbt, WT_CBT_ITERATE_APPEND);
}
コード例 #3
0
ファイル: bt_curprev.c プロジェクト: mullingitover/mongo
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 *
 *	cbt: the btree cursor; its recno, slot, ins_head, ins and
 *	    iface.value fields are updated as the cursor moves.
 *	newpage: true when the cursor has just arrived on this page, in which
 *	    case iteration starts at the page's last record number instead of
 *	    at the record before the current one.
 *
 *	Returns 0 with cbt->iface.value referencing the item's data,
 *	WT_NOTFOUND when there is no earlier record to return on this page,
 *	or whatever WT_RET propagates from __wt_page_cell_data_ref.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		/* A last record number of 0 is treated as "nothing here". */
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		/*
		 * Jump into the loop past the decrement: the record number
		 * just set is the first candidate, not the one before it.
		 */
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

		/* Stop once we've moved before the page's first record. */
new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip =
		    __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			/*
			 * A deleted update means the record is skipped; it is
			 * only added to the page's deleted count when
			 * __wt_txn_visible_all reports true for its
			 * transaction ID.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}

			/* Return the update's value directly. */
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			/* No on-page cell for this slot: try the previous record. */
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				/* A single deleted record: just step over it. */
				if (__wt_cell_rle(&unpack) == 1)
					continue;
				/*
				 * There can be huge gaps in the variable-length
				 * column-store name space appearing as deleted
				 * records. If more than one deleted record, do
				 * the work of finding the next record to return
				 * instead of looping through the records.
				 *
				 * First, find the largest record in the update
				 * list that's smaller than the current record.
				 */
				ins = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Second, for records with RLEs greater than 1,
				 * the above call to __col_var_search located
				 * this record in the page's list of repeating
				 * records, and returned the starting record.
				 * The starting record - 1 is the record to
				 * which we could skip, if there was no larger
				 * record in the update list.
				 */
				cbt->recno = rle_start - 1;
				if (ins != NULL &&
				    WT_INSERT_RECNO(ins) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop decrement. */
				++cbt->recno;
				continue;
			}
			/* Reference the cell's data through the cursor's buffer. */
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));

			/* Remember the slot so the decoded value can be re-used. */
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp->data;
		val->size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
コード例 #4
0
ファイル: bt_curprev.c プロジェクト: ForNowForever/mongo
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 *
 *	cbt: the btree cursor; its recno, slot, ins_head, ins and
 *	    iface.value fields are updated as the cursor moves.
 *	newpage: non-zero when the cursor has just arrived on this page, in
 *	    which case iteration starts at the page's last record number.
 *
 *	Returns 0 with cbt->iface.value referencing the item's data,
 *	WT_NOTFOUND when there is no earlier record to return on this page,
 *	or whatever WT_RET propagates from __wt_page_cell_data_ref.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		/* A last record number of 0 is treated as "nothing here". */
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		/*
		 * Jump into the loop past the decrement: the record number
		 * just set is the first candidate, not the one before it.
		 */
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

		/* Stop once we've moved before the page's first record. */
new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip = __col_var_search(page, cbt->recno)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			/* Deleted update: count it and try the previous record. */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			/* Return the update's value directly. */
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			/* No on-page cell for this slot: try the previous record. */
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			/* Deleted on-page cell: step back one record at a time. */
			if (unpack.type == WT_CELL_DEL)
				continue;
			/* Reference the cell's data through the cursor's buffer. */
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, &cbt->tmp));

			/* Remember the slot so the decoded value can be re-used. */
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
コード例 #5
0
ファイル: bt_curnext.c プロジェクト: rain10154/wiredtiger
/*
 * __cursor_var_next --
 *	Move to the next, variable-length column-store item.
 *
 *	cbt: the btree cursor; its recno, slot, ins_head, ins and
 *	    iface.value fields are updated as the cursor moves.
 *	newpage: non-zero when the cursor has just arrived on this page, in
 *	    which case iteration starts at the page's first record number.
 *
 *	Returns 0 with cbt->iface.value referencing the item's data,
 *	WT_NOTFOUND when there is no later record to return on this page,
 *	or whatever WT_RET propagates from __wt_page_cell_data_ref.
 */
static inline int __cursor_var_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle, rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage){
		cbt->last_standard_recno = __col_var_last_recno(page);
		/* A last record number of 0 is treated as "nothing here". */
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_var_recno);
		/*
		 * Jump into the loop past the increment: the page's first
		 * record number is the first candidate.
		 */
		goto new_page;
	}

	/* Move to the next entry and return the item. */
	for (;;){
		/* Stop once the page's last record has been passed. */
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);

	new_page:
		/* Find the WT_COL slot matching the record number. */
		if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record and read its value. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			/* Deleted update: count it and try the next record. */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		* If we're at the same slot as the last reference and there's
		* no matching insert list item, re-use the return information
		* (so encoded items with large repeat counts aren't repeatedly
		* decoded).  Otherwise, unpack the cell and build the return
		* information.
		* Reaching this point means upd == NULL (the insert list had no
		* update to return for this record), so the value has to come
		* from the on-page cell.
		*/

		if (cbt->cip_saved != cip) {
			/* No on-page cell for this slot: try the next record. */
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				/* A single deleted record: just step over it. */
				if ((rle = __wt_cell_rle(&unpack)) == 1)
					continue;

				/*
				 * A run of deleted records: rather than walking
				 * them one at a time, skip to the first record
				 * past the run (rle_start + rle), or to the
				 * record returned by the insert list's
				 * greater-than search if that is smaller.
				 */
				ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
				cbt->recno = rle_start + rle;
				if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop increment. */
				--cbt->recno;
				continue;
			}

			/* Reference the cell's data through the cursor's tmp buffer. */
			WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));
			/* Remember the slot so the decoded value can be re-used. */
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return 0;
	}
}
コード例 #6
0
ファイル: col_modify.c プロジェクト: Andiry/mongo
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 *
 *	session: the session performing the operation.
 *	cbt: the btree cursor; cbt->ref, cbt->compare, cbt->ins, cbt->slot and
 *	    the insert/next stacks are read, and cbt->ins_head, cbt->ins,
 *	    cbt->modify_update and cbt->recno may be written.
 *	recno: the record number being modified; 0 implies an append.
 *	value: the new value; replaced inside the function for removes.
 *	upd: an existing WT_UPDATE list to install, or NULL to have one
 *	    allocated here.
 *	is_remove: non-zero if the operation is a remove.
 *
 *	Returns 0 on success, otherwise the error set by the failing
 *	WT_RET/WT_ERR call.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_ITEM _value;
	WT_PAGE *page;
	WT_UPDATE *old_upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	int append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	append = logged = 0;

	/* This code expects a remove to have a NULL value. */
	if (is_remove) {
		/*
		 * The exception is fixed-length column-store, where a remove
		 * is written as a one-byte value holding a nul byte.
		 */
		if (btree->type == BTREE_COL_FIX) {
			value = &_value;
			value->data = "";
			value->size = 1;
		} else
			value = NULL;
	} else {
		/*
		 * There's some chance the application specified a record past
		 * the last record on the page.  If that's the case, and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. In addition, a recno of 0
		 * implies an append operation, we're allocating a new row.
		 */
		if (recno == 0 ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(page) : __col_fix_last_recno(page)))
			append = 1;
	}

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, create a new WT_UPDATE
	 * entry and have a serialized function link it into an existing
	 * WT_INSERT entry's WT_UPDATE list.
	 *
	 * Else, allocate an insert array as necessary, build a WT_INSERT and
	 * WT_UPDATE structure pair, and call a serialized function to insert
	 * the WT_INSERT structure.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(
		    session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		WT_ERR(__wt_txn_modify(session, upd));
		/* From here on, an error must undo the transaction's modify. */
		logged = 1;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size));
	} else {
		/*
		 * Allocate the append/update list reference as necessary.
		 * The append list and the fixed-length update list each have
		 * a single head; the variable-length update list has one head
		 * per page entry, indexed by the cursor's slot.
		 */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_append, ins_headp, 1);
			ins_headp = &page->modify->mod_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp, 1);
			ins_headp = &page->modify->mod_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, page->modify->mod_update, ins_headp,
			    page->pg_var_entries);
			ins_headp = &page->modify->mod_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		/*
		 * Allocate a new update unless the caller supplied one, in
		 * which case only its memory size needs to be computed.
		 */
		if (upd == NULL) {
			WT_ERR(
			    __wt_update_alloc(session, value, &upd, &upd_size));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = 1;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth));
	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged)
		WT_ERR(__wt_txn_log_op(session, cbt));

	/* The error block below is only entered through the err label. */
	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		/*
		 * NOTE(review): upd may be the caller-supplied list here (the
		 * path that skips __wt_update_alloc), and it is freed
		 * regardless -- confirm callers expect to give up ownership of
		 * upd when this function fails.
		 */
		__wt_free(session, ins);
		__wt_free(session, upd);
	}

	return (ret);
}
コード例 #7
0
ファイル: col_modify.c プロジェクト: ajdavis/mongo
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 *
 *	session: the session performing the operation.
 *	cbt: the btree cursor; cbt->ref, cbt->compare, cbt->ins,
 *	    cbt->ins_head, cbt->slot and the insert/next stacks are read, and
 *	    cbt->ins, cbt->ins_head, cbt->compare, cbt->modify_update and
 *	    cbt->recno may be written.
 *	recno: the record number being modified; WT_RECNO_OOB implies an
 *	    append.
 *	value: the new value; may be replaced internally for fixed-length
 *	    column-store tombstones.
 *	upd_arg: an existing WT_UPDATE list to install, or NULL to have one
 *	    allocated here; a caller-supplied list is never freed by this
 *	    function.
 *	modify_type: the kind of update (the code tests for
 *	    WT_UPDATE_RESERVE and WT_UPDATE_TOMBSTONE, and may substitute
 *	    WT_UPDATE_STANDARD).
 *	exclusive: passed through to the insert/append serialization
 *	    functions.
 *
 *	Returns 0 on success, otherwise the error set by the failing
 *	WT_RET/WT_ERR call.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, const WT_ITEM *value,
    WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
	/* One-byte value holding a nul byte, used for fixed-length removes. */
	static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_UPDATE *old_upd, *upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	bool append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	upd = upd_arg;
	append = logged = false;

	if (upd_arg == NULL) {
		if (modify_type == WT_UPDATE_RESERVE ||
		    modify_type == WT_UPDATE_TOMBSTONE) {
			/*
			 * Fixed-size column-store doesn't have on-page deleted
			 * values, it's a nul byte.
			 */
			if (modify_type == WT_UPDATE_TOMBSTONE &&
			    btree->type == BTREE_COL_FIX) {
				modify_type = WT_UPDATE_STANDARD;
				value = &col_fix_remove;
			}
		}

		/*
		 * There's a chance the application specified a record past the
		 * last record on the page. If that's the case and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. Also, an out-of-band recno
		 * implies an append operation, we're allocating a new row.
		 * Ignore any information obtained from the search.
		 */
		WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
		if (cbt->compare != 0 &&
		    (recno == WT_RECNO_OOB ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(cbt->ref) :
		    __col_fix_last_recno(cbt->ref)))) {
			append = true;
			cbt->ins = NULL;
			cbt->ins_head = NULL;
		}
	}

	/* We're going to modify the page, we should have loaded history. */
	WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO);

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));
	mod = page->modify;

	/*
	 * If modifying a record not previously modified, but which is in the
	 * same update slot as a previously modified record, cursor.ins will
	 * not be set because there's no list of update records for this recno,
	 * but cursor.ins_head will be set to point to the correct update slot.
	 * Acquire the necessary insert information, then create a new update
	 * entry and link it into the existing list. We get here if a page has
	 * a single cell representing multiple records (the records have the
	 * same value), and then a record in the cell is updated or removed,
	 * creating the update list for the cell, and then a cursor iterates
	 * into that same cell to update/remove a different record. We find the
	 * correct slot in the update array, but we don't find an update list
	 * (because it doesn't exist), and don't have the information we need
	 * to do the insert. Normally, we wouldn't care (we could fail and do
	 * a search for the record which would configure everything for the
	 * insert), but range truncation does this pattern for every record in
	 * the cell, and the performance is terrible. For that reason, catch it
	 * here.
	 */
	if (cbt->ins == NULL && cbt->ins_head != NULL) {
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
		if (cbt->ins != NULL) {
			if (WT_INSERT_RECNO(cbt->ins) == recno)
				cbt->compare = 0;
			else {
				/*
				 * The test below is for cursor.compare set to 0
				 * and cursor.ins set: cursor.compare wasn't set
				 * by the search we just did, and has an unknown
				 * value. Clear cursor.ins to avoid the test.
				 */
				cbt->ins = NULL;
			}
		}
	}

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, cursor.ins will be set to
	 * point to the correct update list. Create a new update entry and link
	 * it into the existing list.
	 *
	 * Else, allocate an insert array as necessary, build an insert/update
	 * structure pair, and link it into place.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd_arg == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session,
		    value, &upd, &upd_size, modify_type));
		WT_ERR(__wt_txn_modify(session, upd));
		/* From here on, an error must undo the transaction's modify. */
		logged = true;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size, false));
	} else {
		/*
		 * Allocate the append/update list reference as necessary.
		 * The append list and the fixed-length update list each have
		 * a single head; the variable-length update list has one head
		 * per page entry, indexed by the cursor's slot.
		 */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_append, ins_headp, 1);
			ins_headp = &mod->mod_col_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_update, ins_headp, 1);
			ins_headp = &mod->mod_col_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session, page,
			    mod->mod_col_update, ins_headp, page->entries);
			ins_headp = &mod->mod_col_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		/*
		 * Check for insert split and checkpoint races in column-store:
		 * it's easy (as opposed to in row-store) and a difficult bug to
		 * otherwise diagnose.
		 */
		WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
		    (recno != WT_RECNO_OOB &&
		    mod->mod_col_split_recno > recno));

		/*
		 * Allocate a new update unless the caller supplied one, in
		 * which case only its memory size needs to be computed.
		 */
		if (upd_arg == NULL) {
			WT_ERR(__wt_update_alloc(session,
			    value, &upd, &upd_size, modify_type));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = true;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth, exclusive));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth, exclusive));

	}

	/*
	 * If the update was successful, add it to the in-memory log; reserve
	 * operations are not logged.
	 */
	if (logged && modify_type != WT_UPDATE_RESERVE) {
		WT_ERR(__wt_txn_log_op(session, cbt));

		/*
		 * In case of append, the recno (key) for the value is assigned
		 * now. Set the recno in the transaction operation to be used
		 * in case this transaction is prepared to retrieve the update
		 * corresponding to this operation.
		 */
		__wt_txn_op_set_recno(session, cbt->recno);
	}

	/* The error block below is only entered through the err label. */
	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		/* Only free the update if it was allocated by this function. */
		if (upd_arg == NULL)
			__wt_free(session, upd);
	}

	return (ret);
}