/* * __cursor_var_append_prev -- * Return the previous variable-length entry on the append list. */ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_page; } for (;;) { WT_RET(__cursor_skip_prev(cbt)); new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL || WT_UPDATE_DELETED_ISSET(upd)) continue; val->data = WT_UPDATE_DATA(upd); val->size = upd->size; break; } return (0); }
/* * __cursor_var_append_prev -- * Return the previous variable-length entry on the append list. */ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_page; } for (;;) { WT_RET(__cursor_skip_prev(cbt)); new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) continue; if (WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* NOTREACHED */ }
/* * __wt_update_alloc -- * Allocate a WT_UPDATE structure and associated value and fill it in. */ int __wt_update_alloc(WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) { WT_UPDATE *upd; size_t size; /* * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ size = value == NULL ? 0 : value->size; WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd)); if (value == NULL) WT_UPDATE_DELETED_SET(upd); else { upd->size = WT_STORE_SIZE(size); memcpy(WT_UPDATE_DATA(upd), value->data, size); } *updp = upd; if (sizep != NULL) *sizep = sizeof(WT_UPDATE) + size; return (0); }
/*btree cursor移向下一个记录,仅仅在append list上移动*/ static inline int __cursor_fix_append_next(WT_CURSOR_BTREE* cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; /*新载入的page,判断ins_head是否为空,如果为空表示没有append的记录*/ if (newpage){ if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); } else{ /*已经到append list的最后一条记录了,后面没有记录*/ if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) && (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL) return (WT_NOTFOUND); } /* * This code looks different from the cursor-previous code. The append * list appears on the last page of the tree, but it may be preceded by * other rows, which means the cursor's recno will be set to a value and * we simply want to increment it. If the cursor's recno is NOT set, * we're starting our iteration in a tree that has only appended items. * In that case, recno will be 0 and happily enough the increment will * set it to 1, which is correct. */ __cursor_set_recno(cbt, cbt->recno + 1); /* * Fixed-width column store appends are inherently non-transactional. * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. This * doesn't apply to variable-width column stores because the implicitly * created records written by reconciliation are deleted and so can be * never seen by a read. * * The problem is that we don't know at this point whether there may be * multiple uncommitted changes at the end of the data, and it would be * expensive to check every time we hit an aborted update. If an * insert is aborted, we simply return zero (empty), regardless of * whether we are at the end of the data. */ if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL){ /*没有可见的记录值,直接返回0*/ cbt->v = 0; val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return 0; }
/* * __cursor_fix_prev -- * Move to the previous, fixed-length column-store item. */ static inline int __cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_BTREE *btree; WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = session->btree; val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_last_recno(cbt->page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { if (cbt->recno == cbt->page->u.col_fix.recno) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->recno - 1); new_page: /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SINGLE(cbt->page); cbt->ins = __col_insert_search( cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); } cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt); val->data = &cbt->v; val->size = 1; return (0); } /* NOTREACHED */ }
/*btree cursor移向下一个记录(fix col方式存储),在btree树空间上移动*/ static inline int __cursor_fix_next(WT_CURSOR_BTREE* cbt, int newpage) { WT_BTREE *btree; WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); page = cbt->ref->page; val = &cbt->iface.value; /*切换到新的page上做next操作*/ if (newpage){ cbt->last_standard_recno = __col_fix_last_recno(page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, page->pg_fix_recno); goto new_page; } /*记录序号超出最后一个序号,到末尾了*/ if (cbt->recno >= cbt->last_standard_recno) return WT_NOTFOUND; __cursor_set_recno(cbt, cbt->recno + 1); new_page: /*获得第一个修改列表*/ cbt->ins_head = WT_COL_UPDATE_SINGLE(page); /*定位recno所在的修改条目*/ cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) /*不正确的修改定位,将ins设置为NULL表示定位失败*/ cbt->ins = NULL; /*做事务隔离读取记录版本*/ upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd == NULL){ cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt); /*V赋值*/ val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return 0; }
/* * __cursor_fix_prev -- * Move to the previous, fixed-length column-store item. */ static inline int __cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_BTREE *btree; WT_INSERT *ins; WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; uint64_t *recnop; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = session->btree; recnop = &cbt->iface.recno; val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_last_recno(cbt->page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); cbt->recno = cbt->last_standard_recno; goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { if (cbt->recno == cbt->page->u.col_fix.recno) return (WT_NOTFOUND); --cbt->recno; new_page: *recnop = cbt->recno; /* Check any insert list for a matching record. */ if ((ins = cbt->ins = __col_insert_search_match( WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL && (upd = __wt_txn_read(session, ins->upd)) != NULL) { val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); } cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt); val->data = &cbt->v; val->size = 1; return (0); } /* NOTREACHED */ }
/* * __txn_op_log -- * Log an operation for the current transaction. */ static int __txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt) { WT_DECL_RET; WT_ITEM key, value; WT_UPDATE *upd; uint64_t recno; WT_CLEAR(key); upd = op->u.upd; value.data = WT_UPDATE_DATA(upd); value.size = upd->size; /* * Log the operation. It must be one of the following: * 1) column store remove; * 2) column store insert/update; * 3) row store remove; or * 4) row store insert/update. */ if (cbt->btree->type == BTREE_ROW) { WT_ERR(__wt_cursor_row_leaf_key(cbt, &key)); if (WT_UPDATE_DELETED_ISSET(upd)) WT_ERR(__wt_logop_row_remove_pack(session, logrec, op->fileid, &key)); else WT_ERR(__wt_logop_row_put_pack(session, logrec, op->fileid, &key, &value)); } else { recno = WT_INSERT_RECNO(cbt->ins); WT_ASSERT(session, recno != WT_RECNO_OOB); if (WT_UPDATE_DELETED_ISSET(upd)) WT_ERR(__wt_logop_col_remove_pack(session, logrec, op->fileid, recno)); else WT_ERR(__wt_logop_col_put_pack(session, logrec, op->fileid, recno, &value)); } err: __wt_buf_free(session, &key); return (ret); }
/* * __cursor_fix_append_prev -- * Return the previous fixed-length entry on the append list. */ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); cbt->recno = WT_INSERT_RECNO(cbt->ins); } else { if (cbt->recno == WT_INSERT_RECNO(cbt->ins)) { __cursor_skip_prev(cbt); if (cbt->ins == NULL) return (WT_NOTFOUND); } --cbt->recno; } /* * Column store appends are inherently non-transactional. * * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. */ cbt->iface.recno = cbt->recno; if (cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); }
/* * __txn_op_log -- * Log an operation for the current transaction. */ static int __txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op) { WT_ITEM value; uint64_t recno; value.data = WT_UPDATE_DATA(op->u.op.upd); value.size = op->u.op.upd->size; /* * Log the operation. It must be one of the following: * 1) column store remove; * 2) column store insert/update; * 3) row store remove; or * 4) row store insert/update. */ if (op->u.op.key.data == NULL) { WT_ASSERT(session, op->u.op.ins != NULL); recno = op->u.op.ins->u.recno; if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) WT_RET(__wt_logop_col_remove_pack(session, logrec, op->fileid, recno)); else WT_RET(__wt_logop_col_put_pack(session, logrec, op->fileid, recno, &value)); } else { if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) WT_RET(__wt_logop_row_remove_pack(session, logrec, op->fileid, &op->u.op.key)); else WT_RET(__wt_logop_row_put_pack(session, logrec, op->fileid, &op->u.op.key, &value)); } return (0); }
/*在append list上移动variable-length类型的btree cursor*/ static inline int __cursor_var_append_next(WT_CURSOR_BTREE* cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage){ cbt->ins = WT_SKIP_FIRST(cbt->ins_head); goto new_page; } for (;;){ cbt->ins = WT_SKIP_NEXT(cbt->ins); new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); /*事务隔离读,对本事务不可见,继续向前*/ if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) continue; /*删除集合,不做指向这条记录,继续向下移动*/ if (WT_UPDATE_DELETED_ISSET(upd)) { ++cbt->page_deleted_count; continue; } /*赋值value*/ val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return 0; } }
/* * __wt_update_alloc -- * Allocate a WT_UPDATE structure and associated value from the session's * buffer and fill it in. */ int __wt_update_alloc(WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep) { WT_DECL_RET; WT_UPDATE *upd; size_t size; /* * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ size = value == NULL ? 0 : value->size; WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd)); if (value == NULL) WT_UPDATE_DELETED_SET(upd); else { upd->size = WT_STORE_SIZE(size); memcpy(WT_UPDATE_DATA(upd), value->data, size); } /* * This must come last: after __wt_txn_modify succeeds, we must return * a non-NULL upd so our callers can call __wt_txn_unmodify on any * subsequent failure. */ if ((ret = __wt_txn_modify(session, &upd->txnid)) != 0) { __wt_free(session, upd); return (ret); } *updp = upd; if (sizep != NULL) *sizep = sizeof(WT_UPDATE) + size; return (0); }
/* * __wt_kv_return -- * Return a page referenced key/value pair to the application. */ int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_ROW *rip; WT_UPDATE *upd; uint8_t v; btree = S2BT(session); page = cbt->page; cursor = &cbt->iface; switch (page->type) { case WT_PAGE_COL_FIX: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); return (__wt_buf_set(session, &cursor->value, &v, 1)); case WT_PAGE_COL_VAR: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]); break; case WT_PAGE_ROW_LEAF: rip = &page->u.row.d[cbt->slot]; /* * If the cursor references a WT_INSERT item, take the key and * related WT_UPDATE item. Otherwise, take the key from the * original page, and the value from any related WT_UPDATE item, * or the page if the key was never updated. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); } else { WT_RET( __wt_row_key(session, page, rip, &cursor->key, 0)); upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); } if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Take the original cell (which may be empty). */ if ((cell = __wt_row_value(page, rip)) == NULL) { cursor->value.size = 0; return (0); } break; WT_ILLEGAL_VALUE(session); } /* The value is an on-page cell, unpack and expand it as necessary. */ __wt_cell_unpack(cell, &unpack); ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value); /* * Restart for a variable-length column-store. We could catch restart * higher up the call-stack but there's no point to it: unlike row-store * (where a normal search path finds cached overflow values), we have to * access the page's reconciliation structures, and that's as easy here * as higher up the stack. */ if (ret == WT_RESTART && page->type == WT_PAGE_COL_VAR) ret = __wt_ovfl_cache_col_restart( session, page, &unpack, &cursor->value); return (ret); }
/* * __cursor_row_prev -- * Move to the previous row-store item. */ static inline int __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_INSERT *ins; WT_ITEM *key, *val; WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; key = &cbt->iface.key; val = &cbt->iface.value; /* * For row-store pages, we need a single item that tells us the part * of the page we're walking (otherwise switching from next to prev * and vice-versa is just too complicated), so we map the WT_ROW and * WT_INSERT_HEAD insert array slots into a single name space: slot 1 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are * odd-numbered slots, and WT_ROW array slots are even-numbered slots. * * New page configuration. */ if (newpage) { /* * If we haven't instantiated keys on this page, do so, else it * is a very, very slow traversal. */ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(__wt_row_leaf_keys(session, page)); if (page->pg_row_entries == 0) cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); else cbt->ins_head = WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; goto new_insert; } /* Move to the previous entry and return the item. */ for (;;) { /* * Continue traversing any insert list. Maintain the reference * to the current insert element in case we switch to a cursor * next movement. */ if (cbt->ins != NULL) WT_RET(__cursor_skip_prev(cbt)); new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL) continue; if (WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* Check for the beginning of the page. */ if (cbt->row_iteration_slot == 1) return (WT_NOTFOUND); --cbt->row_iteration_slot; /* * Odd-numbered slots configure as WT_INSERT_HEAD entries, * even-numbered slots configure as WT_ROW entries. */ if (cbt->row_iteration_slot & 0x01) { cbt->ins_head = cbt->row_iteration_slot == 1 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT( page, cbt->row_iteration_slot / 2 - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_insert; } cbt->ins_head = NULL; cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row_d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } return (__cursor_row_slot_return(cbt, rip, upd)); } /* NOTREACHED */ }
/* * __cursor_var_prev -- * Move to the previous, variable-length column-store item. */ static inline int __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; WT_INSERT *ins; WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; uint64_t rle_start; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; val = &cbt->iface.value; rle_start = 0; /* -Werror=maybe-uninitialized */ /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_var_last_recno(page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { __cursor_set_recno(cbt, cbt->recno - 1); new_page: if (cbt->recno < page->pg_var_recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* * If we're at the same slot as the last reference and there's * no matching insert list item, re-use the return information * (so encoded items with large repeat counts aren't repeatedly * decoded). Otherwise, unpack the cell and build the return * information. */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) { if (__wt_cell_rle(&unpack) == 1) continue; /* * There can be huge gaps in the variable-length * column-store name space appearing as deleted * records. If more than one deleted record, do * the work of finding the next record to return * instead of looping through the records. * * First, find the largest record in the update * list that's smaller than the current record. */ ins = __col_insert_search_lt( cbt->ins_head, cbt->recno); /* * Second, for records with RLEs greater than 1, * the above call to __col_var_search located * this record in the page's list of repeating * records, and returned the starting record. * The starting record - 1 is the record to * which we could skip, if there was no larger * record in the update list. */ cbt->recno = rle_start - 1; if (ins != NULL && WT_INSERT_RECNO(ins) > cbt->recno) cbt->recno = WT_INSERT_RECNO(ins); /* Adjust for the outer loop decrement. */ ++cbt->recno; continue; } WT_RET(__wt_page_cell_data_ref( session, page, &unpack, cbt->tmp)); cbt->cip_saved = cip; } val->data = cbt->tmp->data; val->size = cbt->tmp->size; return (0); } /* NOTREACHED */ }
/* * __wt_kv_return -- * Return a page referenced key/value pair to the application. */ int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_CURSOR *cursor; WT_IKEY *ikey; WT_PAGE *page; WT_ROW *rip; WT_UPDATE *upd; uint8_t v; btree = session->btree; unpack = &_unpack; page = cbt->page; cursor = &cbt->iface; switch (page->type) { case WT_PAGE_COL_FIX: if (key_ret) cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL) { upd = cbt->ins->upd; cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); return (__wt_buf_set(session, &cursor->value, &v, 1)); case WT_PAGE_COL_VAR: if (key_ret) cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL) { upd = cbt->ins->upd; cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]); break; case WT_PAGE_ROW_LEAF: rip = &page->u.row.d[cbt->slot]; /* * If the cursor references a WT_INSERT item, take the key and * related WT_UPDATE item. Otherwise, take the key from the * original page, and the value from any related WT_UPDATE item, * or the page if the key was never updated. */ if (cbt->ins == NULL) { if (key_ret) { if (__wt_off_page(page, rip->key)) { ikey = rip->key; cursor->key.data = WT_IKEY_DATA(ikey); cursor->key.size = ikey->size; } else WT_RET(__wt_row_key( session, page, rip, &cursor->key)); } upd = WT_ROW_UPDATE(page, rip); } else { if (key_ret) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); } upd = cbt->ins->upd; } if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Take the original cell (which may be empty). */ if ((cell = __wt_row_value(page, rip)) == NULL) { cursor->value.size = 0; return (0); } break; WT_ILLEGAL_VALUE(session); } /* It's a cell, unpack and expand it as necessary. */ __wt_cell_unpack(cell, unpack); if (btree->huffman_value == NULL && unpack->type == WT_CELL_VALUE) { cursor->value.data = unpack->data; cursor->value.size = unpack->size; return (0); } else return (__wt_cell_unpack_copy(session, unpack, &cursor->value)); }
/* * __cursor_fix_append_prev -- * Return the previous fixed-length entry on the append list. */ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); } else { /* * Handle the special case of leading implicit records, that is, * there aren't any records in the tree not on the append list, * and the first record on the append list isn't record 1. * * The "right" place to handle this is probably in our caller. * The high-level cursor-previous routine would: * -- call this routine to walk the append list * -- call the routine to walk the standard page items * -- call the tree walk routine looking for a previous page * Each of them returns WT_NOTFOUND, at which point our caller * checks the cursor record number, and if it's larger than 1, * returns the implicit records. Instead, I'm trying to detect * the case here, mostly because I don't want to put that code * into our caller. Anyway, if this code breaks for any reason, * that's the way I'd go. * * If we're not pointing to a WT_INSERT entry, or we can't find * a WT_INSERT record that precedes our record name-space, check * if there are any records on the page. If there aren't, then * we're in the magic zone, keep going until we get to a record * number of 1. */ if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins)) WT_RET(__cursor_skip_prev(cbt)); if (cbt->ins == NULL && (cbt->recno == 1 || __col_fix_last_recno(page) != 0)) return (WT_NOTFOUND); } /* * This code looks different from the cursor-next code. The append * list appears on the last page of the tree and contains the last * records in the tree. If we're iterating through the tree, starting * at the last record in the tree, by definition we're starting a new * iteration and we set the record number to the last record found in * the tree. Otherwise, decrement the record. */ if (newpage) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); else __cursor_set_recno(cbt, cbt->recno - 1); /* * Fixed-width column store appends are inherently non-transactional. * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. This * doesn't apply to variable-width column stores because the implicitly * created records written by reconciliation are deleted and so can be * never seen by a read. */ if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); }
/* * __cursor_var_prev -- * Move to the previous, variable-length column-store item. */ static inline int __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_var_last_recno(page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { __cursor_set_recno(cbt, cbt->recno - 1); new_page: if (cbt->recno < page->pg_var_recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ if ((cip = __col_var_search(page, cbt->recno)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) { ++cbt->page_deleted_count; continue; } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* * If we're at the same slot as the last reference and there's * no matching insert list item, re-use the return information * (so encoded items with large repeat counts aren't repeatedly * decoded). Otherwise, unpack the cell and build the return * information. */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) continue; WT_RET(__wt_page_cell_data_ref( session, page, &unpack, &cbt->tmp)); cbt->cip_saved = cip; } val->data = cbt->tmp.data; val->size = cbt->tmp.size; return (0); } /* NOTREACHED */ }
/* * __cursor_row_next -- * Move to the next row-store item. */ static inline int __cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage) { WT_INSERT *ins; WT_ITEM *key, *val; WT_ROW *rip; WT_SESSION_IMPL *session; WT_UPDATE *upd; key = &cbt->iface.key; val = &cbt->iface.value; session = (WT_SESSION_IMPL *)cbt->iface.session; /* * For row-store pages, we need a single item that tells us the part * of the page we're walking (otherwise switching from next to prev * and vice-versa is just too complicated), so we map the WT_ROW and * WT_INSERT_HEAD insert array slots into a single name space: slot 1 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are * odd-numbered slots, and WT_ROW array slots are even-numbered slots. * * New page configuration. */ if (newpage) { cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); cbt->row_iteration_slot = 1; goto new_insert; } /* Move to the next entry and return the item. */ for (;;) { /* * Continue traversing any insert list; maintain the insert list * head reference and entry count in case we switch to a cursor * previous movement. */ if (cbt->ins != NULL) cbt->ins = WT_SKIP_NEXT(cbt->ins); new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL || WT_UPDATE_DELETED_ISSET(upd)) continue; key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* Check for the end of the page. */ if (cbt->row_iteration_slot >= cbt->page->entries * 2 + 1) return (WT_NOTFOUND); ++cbt->row_iteration_slot; /* * Odd-numbered slots configure as WT_INSERT_HEAD entries, * even-numbered slots configure as WT_ROW entries. */ if (cbt->row_iteration_slot & 0x01) { cbt->ins_head = WT_ROW_INSERT_SLOT( cbt->page, cbt->row_iteration_slot / 2 - 1); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); goto new_insert; } cbt->ins_head = NULL; cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &cbt->page->u.row.d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(cbt->page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) continue; return (__cursor_row_slot_return(cbt, rip, upd)); } /* NOTREACHED */ }
int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_PAGE *page; WT_ROW *rip; uint8_t v; switch (page->type){ case WT_PAGE_COL_FIX: cursor->recno = cbt->recno; /*cursor对应的是一个upd,直接返回value*/ if (upd != NULL){ cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return 0; } v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); return __wt_buf_set(session, &cursor->value, &v, 1); case WT_PAGE_COL_VAR: cursor->recno = cbt->recno; if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /*获得对应的cell,并通过cell得到K/V值*/ cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); break; case WT_PAGE_ROW_LEAF: rip = &page->pg_row_d[cbt->slot]; if (cbt->ins != NULL){ /*插入的k/v对*/ cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); } else if (cbt->compare == 0){/*比较器定位到了对应的k/v对*/ cursor->key.data = cbt->search_key.data; cursor->key.size = cbt->search_key.size; } else WT_RET(__wt_row_leaf_key(session, page, rip, &cursor->key, 0)); /*设置key的值*/ /*值是在append/update list当中,从当中取*/ if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /*可以直接通过rip指针获得value,K/V是存储在cell空间之内*/ if (__wt_row_leaf_value(page, rip, &cursor->value)) return 0; /*不是连续存储的,需要通过解析cell来定位到value*/ if (cell = __wt_row_leaf_value_cell(page, rip, NULL) == NULL){ cursor->value.size = 0; return 0; } break; WT_ILLEGAL_VALUE(session); } /*通过cell解析到对应的value值, ovfl item*/ __wt_cell_unpack(cell, &unpack); WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value)); return 0; }
/* * __cursor_var_prev -- * Move to the previous, variable-length column-store item. */ static inline int __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; WT_INSERT *ins; WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; uint64_t *recnop; session = (WT_SESSION_IMPL *)cbt->iface.session; recnop = &cbt->iface.recno; val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_last_recno(cbt->page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); cbt->recno = cbt->last_standard_recno; goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { --cbt->recno; new_page: *recnop = cbt->recno; if (cbt->recno < cbt->page->u.col_var.recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL) return (WT_NOTFOUND); /* Check any insert list for a matching record. */ if ((ins = __col_insert_search_match( WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL && (upd = __wt_txn_read(session, ins->upd)) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) continue; cbt->ins = ins; val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* * If we're at the same slot as the last reference and there's * no matching insert list item, re-use the return information * (so encoded items with large repeat counts aren't repeatedly * decoded). Otherwise, unpack the cell and build the return * information. */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); switch (unpack.type) { case WT_CELL_DEL: continue; case WT_CELL_VALUE: if (session->btree->huffman_value == NULL) { cbt->tmp.data = unpack.data; cbt->tmp.size = unpack.size; break; } /* FALLTHROUGH */ default: WT_RET(__wt_cell_unpack_copy( session, &unpack, &cbt->tmp)); } cbt->cip_saved = cip; } val->data = cbt->tmp.data; val->size = cbt->tmp.size; return (0); } /* NOTREACHED */ }
/*移向行存储的下一个行对象*/ static inline int __cursor_row_next(WT_CURSOR_BTREE* cbt, int newpage) { WT_INSERT *ins; WT_ITEM *key, *val; WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; key = &cbt->iface.key; val = &cbt->iface.value; /*假如是newpage,定位到insert修改队列的头位置*/ if (newpage){ cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); cbt->row_iteration_slot = 1; goto new_insert; } for (;;){ if (cbt->ins != NULL) cbt->ins = WT_SKIP_NEXT(cbt->ins); new_insert: if ((ins = cbt->ins) != NULL) { /*事务可见数据读取*/ if ((upd = __wt_txn_read(session, ins->upd)) == NULL) continue; /*判断是否删除,如果删除,跳过被删除的对象*/ if (WT_UPDATE_DELETED_ISSET(upd)) { ++cbt->page_deleted_count; continue; } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return 0; } /*检索page row entires数组, 到了page的末尾*/ if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1) return (WT_NOTFOUND); ++cbt->row_iteration_slot; /* * Odd-numbered slots configure as WT_INSERT_HEAD entries, * even-numbered slots configure as WT_ROW entries. */ if (cbt->row_iteration_slot & 0x01) { cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); goto new_insert; } cbt->ins_head = NULL; cbt->ins = NULL; /*计算定位slot*/ cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row_d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { ++cbt->page_deleted_count; continue; } return __cursor_row_slot_return(cbt, rip, upd); } }
/*移向下条variable-length column-store 记录*/ static inline int __cursor_var_next(WT_CURSOR_BTREE* cbt, int newpage) { WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; WT_ITEM *val; WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; uint64_t rle, rle_start; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; val = &cbt->iface.value; rle_start = 0; /* -Werror=maybe-uninitialized */ if (newpage){ cbt->last_standard_recno = __col_var_last_recno(page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, page->pg_var_recno); goto new_page; } for (;;){ if (cbt->recno >= cbt->last_standard_recno) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->recno + 1); new_page: /*定位到recno对应的WT_COL slot*/ if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); /*读取内容值*/ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) { ++cbt->page_deleted_count; continue; } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* * If we're at the same slot as the last reference and there's * no matching insert list item, re-use the return information * (so encoded items with large repeat counts aren't repeatedly * decoded). Otherwise, unpack the cell and build the return * information. * upd == NULL, 记录可能被删除放入到了insert列表中,slot可能被重用了,那么需要进行cell unpack取值 */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) { if ((rle = __wt_cell_rle(&unpack)) == 1) continue; /*定位到修改列表中的记录*/ ins = __col_insert_search_gt(cbt->ins_head, cbt->recno); cbt->recno = rle_start + rle; if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno) cbt->recno = WT_INSERT_RECNO(ins); /* Adjust for the outer loop increment. */ --cbt->recno; continue; } /*取出cell中的值到tmp中*/ WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp)); cbt->cip_saved = cip; } val->data = cbt->tmp.data; val->size = cbt->tmp.size; return 0; } }
/* * __cursor_var_prev -- * Move to the previous, variable-length column-store item. */ static inline int __cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; WT_DECL_RET; WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { cbt->last_standard_recno = __col_last_recno(cbt->page); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); goto new_page; } /* Move to the previous entry and return the item. */ for (;;) { __cursor_set_recno(cbt, cbt->recno - 1); new_page: if (cbt->recno < cbt->page->u.col_var.recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(cbt->page, cip); /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(cbt->page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) continue; val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* * If we're at the same slot as the last reference and there's * no matching insert list item, re-use the return information * (so encoded items with large repeat counts aren't repeatedly * decoded). Otherwise, unpack the cell and build the return * information. */ if (cbt->cip_saved != cip) { if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL) continue; __wt_cell_unpack(cell, &unpack); if (unpack.type == WT_CELL_DEL) continue; /* * Restart for a variable-length column-store. We could * catch restart higher up the call-stack but there's no * point to it: unlike row-store (where a normal search * path finds cached overflow values), we have to access * the page's reconciliation structures, and that's as * easy here as higher up the stack. */ if ((ret = __wt_cell_unpack_ref( session, &unpack, &cbt->tmp)) == WT_RESTART) ret = __wt_ovfl_cache_col_restart( session, cbt->page, &unpack, &cbt->tmp); WT_RET(ret); cbt->cip_saved = cip; } val->data = cbt->tmp.data; val->size = cbt->tmp.size; return (0); } /* NOTREACHED */ }
/* * __wt_kv_return -- * Return a page referenced key/value pair to the application. */ int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_ITEM *tmp; WT_PAGE *page; WT_ROW *rip; uint8_t v; btree = S2BT(session); page = cbt->ref->page; cursor = &cbt->iface; switch (page->type) { case WT_PAGE_COL_FIX: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* If the cursor references a WT_UPDATE item, return it. */ if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Take the value from the original page. */ v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); return (__wt_buf_set(session, &cursor->value, &v, 1)); case WT_PAGE_COL_VAR: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* If the cursor references a WT_UPDATE item, return it. */ if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Take the value from the original page cell. */ cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); break; case WT_PAGE_ROW_LEAF: rip = &page->pg_row_d[cbt->slot]; /* * If the cursor references a WT_INSERT item, take its key. * Else, if we have an exact match, we copied the key in the * search function, take it from there. * If we don't have an exact match, take the key from the * original page. */ if (cbt->ins != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); } else if (cbt->compare == 0) { /* * If not in an insert list and there's an exact match, * the row-store search function built the key we want * to return in the cursor's temporary buffer. Swap the * cursor's search-key and temporary buffers so we can * return it (it's unsafe to return the temporary buffer * itself because our caller might do another search in * this table using the key we return, and we'd corrupt * the search key during any subsequent search that used * the temporary buffer. */ tmp = cbt->row_key; cbt->row_key = cbt->tmp; cbt->tmp = tmp; cursor->key.data = cbt->row_key->data; cursor->key.size = cbt->row_key->size; } else WT_RET(__wt_row_leaf_key( session, page, rip, &cursor->key, false)); /* If the cursor references a WT_UPDATE item, return it. */ if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Simple values have their location encoded in the WT_ROW. */ if (__wt_row_leaf_value(page, rip, &cursor->value)) return (0); /* * Take the value from the original page cell (which may be * empty). */ if ((cell = __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) { cursor->value.size = 0; return (0); } break; WT_ILLEGAL_VALUE(session); } /* The value is an on-page cell, unpack and expand it as necessary. */ __wt_cell_unpack(cell, &unpack); WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value)); return (0); }