/*
 * __cursor_fix_prev --
 *	Step the cursor back one record on a fixed-length column-store page
 *	and load that record's value.
 *
 *	With newpage set, the cursor starts at the record number reported by
 *	__col_last_recno for the page; otherwise it moves from its current
 *	record number to the one before it. Returns 0 with a one-byte value
 *	referenced by the cursor, or WT_NOTFOUND if the page has no such
 *	record.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = session->btree;
	val = &cbt->iface.value;

	if (newpage) {
		/* First call on this page: begin at its last record number. */
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
	} else {
		/* Stop once the page's starting record number is reached. */
		if (cbt->recno == cbt->page->u.col_fix.recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno - 1);
	}

	/*
	 * Search the page's single insert list; keep the result only when its
	 * record number is exactly the one the cursor is on.
	 */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(cbt->page);
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && WT_INSERT_RECNO(cbt->ins) != cbt->recno)
		cbt->ins = NULL;

	/* Use the update __wt_txn_read selects, if there is one. */
	upd = NULL;
	if (cbt->ins != NULL)
		upd = __wt_txn_read(session, cbt->ins->upd);

	if (upd == NULL) {
		/* No update: read the value from the page's bit field. */
		cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt);
		val->data = &cbt->v;
	} else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return (0);
}
/*
 * __cursor_fix_next --
 *	Advance a btree cursor to the next record of a fixed-length
 *	column-store page and load that record's value.
 *
 *	With newpage set, the cursor starts at the page's pg_fix_recno;
 *	otherwise it moves from its current record number to the following
 *	one. Returns 0 with a one-byte value referenced by the cursor, or
 *	WT_NOTFOUND if the page has no such record.
 */
static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;
	val = &cbt->iface.value;

	if (newpage) {
		/* Moving onto a new page: set up and start at its first recno. */
		cbt->last_standard_recno = __col_fix_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_fix_recno);
	} else {
		/* Already at (or past) the last record number: end of page. */
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);
	}

	/* Fetch the page's single update list. */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);

	/*
	 * Locate the entry for this record number; a result carrying a
	 * different record number is not a match, so it is discarded.
	 */
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && WT_INSERT_RECNO(cbt->ins) != cbt->recno)
		cbt->ins = NULL;

	/* Let __wt_txn_read choose which version of the record to return. */
	upd = NULL;
	if (cbt->ins != NULL)
		upd = __wt_txn_read(session, cbt->ins->upd);

	if (upd != NULL)
		val->data = WT_UPDATE_DATA(upd);
	else {
		/* No update: read the value from the page's bit field. */
		cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
		val->data = &cbt->v;
	}
	val->size = 1;
	return (0);
}
/*
 * __cursor_fix_prev --
 *	Step the cursor back one record on a fixed-length column-store page
 *	and load that record's value.
 *
 *	With newpage set, the cursor starts at the record number reported by
 *	__col_last_recno for the page; otherwise it decrements its current
 *	record number. Returns 0 with a one-byte value referenced by the
 *	cursor, or WT_NOTFOUND if the page has no such record.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = session->btree;
	val = &cbt->iface.value;

	if (newpage) {
		/* First call on this page: begin at its last record number. */
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		cbt->recno = cbt->last_standard_recno;
	} else {
		/* Stop once the page's starting record number is reached. */
		if (cbt->recno == cbt->page->u.col_fix.recno)
			return (WT_NOTFOUND);
		--cbt->recno;
	}

	/* Publish the record number through the interface cursor. */
	cbt->iface.recno = cbt->recno;

	/* Look for an insert-list entry matching this record number. */
	ins = __col_insert_search_match(
	    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno);
	cbt->ins = ins;

	/* Use the update __wt_txn_read selects, if there is one. */
	upd = NULL;
	if (ins != NULL)
		upd = __wt_txn_read(session, ins->upd);

	if (upd == NULL) {
		/* No update: read the value from the page's bit field. */
		cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt);
		val->data = &cbt->v;
	} else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return (0);
}
/*
 * __cursor_fix_next --
 *	Advance the cursor one record on a fixed-length column-store page and
 *	load that record's value.
 *
 *	With newpage set, the cursor starts at the reference's ref_recno;
 *	otherwise it moves from its current record number to the following
 *	one. Returns 0 with a one-byte value referenced by the cursor, or
 *	WT_NOTFOUND if the page has no such record.
 */
static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;

	if (!newpage) {
		/* Already at (or past) the last record number: end of page. */
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);
	} else {
		/* First call on this page: set up and start at its first recno. */
		cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->ref->ref_recno);
	}

	/*
	 * Search the page's single insert list; keep the result only when its
	 * record number is exactly the one the cursor is on.
	 */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && WT_INSERT_RECNO(cbt->ins) != cbt->recno)
		cbt->ins = NULL;

	/* Use the update __wt_txn_read selects, if there is one. */
	upd = NULL;
	if (cbt->ins != NULL)
		upd = __wt_txn_read(session, cbt->ins->upd);

	if (upd != NULL)
		cbt->iface.value.data = upd->data;
	else {
		/* No update: read the value from the page's bit field. */
		cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
		cbt->iface.value.data = &cbt->v;
	}
	cbt->iface.value.size = 1;
	return (0);
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 *
 *	session: the session making the call.
 *	cbt: the btree cursor; its ref, recno, slot, ins, compare and
 *	    search_key fields select the item, and its interface cursor
 *	    (cbt->iface) receives the key and value.
 *	upd: if not NULL, the update whose data is returned as the value
 *	    instead of the on-page value.
 *
 *	Returns 0 on success, otherwise the error returned by a helper.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	/* These must be set before the switch dereferences page. */
	btree = S2BT(session);
	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;

		/* The cursor references an update: return its value directly. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise copy the one-byte value read from the page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		/* The cursor references an update: return its value directly. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Get the slot's cell; the value is unpacked from it below. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * Key: take it from the insert item if the cursor references
		 * one, from the search key if the comparison was an exact
		 * match, and from the page otherwise.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, 0));

		/* Value: an update takes precedence over the on-page value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* The value may be available directly through the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Otherwise find the value's cell. The assignment must be
		 * parenthesized: "==" binds tighter than "=", and without the
		 * parentheses cell would receive the result of the comparison
		 * rather than the cell pointer. No cell means an empty value.
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Unpack the cell and reference its data as the cursor's value. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/*
 * __wt_kv_return --
 *	Hand the key/value pair the cursor references back to the
 *	application through the interface cursor.
 *
 *	If upd is not NULL its data is returned as the value; otherwise the
 *	value is taken from the page. Returns 0 on success, otherwise the
 *	error returned by a helper.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *swap;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);
	cursor = &cbt->iface;
	page = cbt->ref->page;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
		/*
		 * Both column-store page types begin the same way. The
		 * interface cursor's record number is usually already set,
		 * but not always (cursor.search_near may call here without
		 * setting it), so set it unconditionally.
		 */
		cursor->recno = cbt->recno;

		/* A referenced WT_UPDATE supplies the value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Fixed-length: copy the value out of the original page. */
		if (page->type == WT_PAGE_COL_FIX) {
			v = __bit_getv_recno(
			    page, cbt->iface.recno, btree->bitcnt);
			return (__wt_buf_set(session, &cursor->value, &v, 1));
		}

		/* Variable-length: the value is in the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * Key selection, in order of preference: the WT_INSERT item
		 * the cursor references, the key copied by the search
		 * function on an exact match, the key on the original page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/*
			 * On an exact match outside an insert list, the
			 * row-store search built the wanted key in the
			 * cursor's temporary buffer. Exchange the search-key
			 * and temporary buffers and return the former: the
			 * temporary buffer itself is unsafe to return, since
			 * the caller may search this table again with the
			 * returned key and a later search using the temporary
			 * buffer would corrupt it.
			 */
			swap = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = swap;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		} else {
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));
		}

		/* A referenced WT_UPDATE supplies the value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values are located through the WT_ROW itself. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/* Otherwise use the original page cell, which may be empty. */
		cell = __wt_row_leaf_value_cell(page, rip, NULL);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* On-page cell: unpack it and expand the value as necessary. */
	__wt_cell_unpack(cell, &unpack);
	return (__wt_page_cell_data_ref(
	    session, page, &unpack, &cursor->value));
}
/*
 * __wt_kv_return --
 *	Hand the key/value pair the cursor references back to the
 *	application through the interface cursor.
 *
 *	The key (or record number) is only filled in when key_ret is
 *	non-zero. Returns 0 on success, otherwise the error returned by a
 *	helper.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = session->btree;
	cursor = &cbt->iface;
	page = cbt->page;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* No WT_INSERT item: copy the value out of the page. */
		if (cbt->ins == NULL) {
			v = __bit_getv_recno(
			    page, cbt->iface.recno, btree->bitcnt);
			return (__wt_buf_set(session, &cursor->value, &v, 1));
		}

		/* Otherwise the value is the insert's WT_UPDATE item. */
		upd = cbt->ins->upd;
		cursor->value.data = WT_UPDATE_DATA(upd);
		cursor->value.size = upd->size;
		return (0);
	case WT_PAGE_COL_VAR:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* No WT_INSERT item: the value is in the slot's page cell. */
		if (cbt->ins == NULL) {
			cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
			break;
		}

		/* Otherwise the value is the insert's WT_UPDATE item. */
		upd = cbt->ins->upd;
		cursor->value.data = WT_UPDATE_DATA(upd);
		cursor->value.size = upd->size;
		return (0);
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * With a WT_INSERT item, both the key and the WT_UPDATE come
		 * from it. Without one, the key comes from the original page
		 * and the WT_UPDATE, if any, from the row's update list.
		 */
		if (cbt->ins != NULL) {
			if (key_ret) {
				cursor->key.data = WT_INSERT_KEY(cbt->ins);
				cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
			}
			upd = cbt->ins->upd;
		} else {
			if (key_ret) {
				if (__wt_off_page(page, rip->key)) {
					ikey = rip->key;
					cursor->key.data = WT_IKEY_DATA(ikey);
					cursor->key.size = ikey->size;
				} else {
					WT_RET(__wt_row_key(
					    session, page, rip, &cursor->key));
				}
			}
			upd = WT_ROW_UPDATE(page, rip);
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Fall back to the original cell, which may be empty. */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * The value is a cell: unpack it. Anything other than a WT_CELL_VALUE
	 * cell in a tree with no huffman_value goes through
	 * __wt_cell_unpack_copy; that case is referenced in place.
	 */
	__wt_cell_unpack(cell, &unpack);
	if (btree->huffman_value != NULL || unpack.type != WT_CELL_VALUE)
		return (__wt_cell_unpack_copy(session, &unpack, &cursor->value));

	cursor->value.data = unpack.data;
	cursor->value.size = unpack.size;
	return (0);
}
/*
 * __wt_kv_return --
 *	Hand the key/value pair the cursor references back to the
 *	application through the interface cursor.
 *
 *	Returns 0 on success, otherwise the error returned by a helper.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = S2BT(session);
	cursor = &cbt->iface;
	page = cbt->page;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record number is usually already
		 * set, but not always: cursor.search_near may call here
		 * without setting it first.
		 */
		cursor->recno = cbt->recno;

		/* Prefer the WT_UPDATE of a referenced WT_INSERT item. */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise copy the one-byte value read from the page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record number is usually already
		 * set, but not always: cursor.search_near may call here
		 * without setting it first.
		 */
		cursor->recno = cbt->recno;

		/* Prefer the WT_UPDATE of a referenced WT_INSERT item. */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise the value is in the slot's on-page cell. */
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * When the cursor references a WT_INSERT item for which
		 * __wt_txn_read yields a WT_UPDATE, take the key and update
		 * from it. In every other case take the key from the
		 * original page and read the update from the row's update
		 * list.
		 */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else {
			WT_RET(
			    __wt_row_key(session, page, rip, &cursor->key, 0));
			upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Fall back to the original cell, which may be empty. */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell: unpack and expand as necessary. */
	__wt_cell_unpack(cell, &unpack);
	ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value);

	/* Anything but a restart on a variable-length column-store is final. */
	if (ret != WT_RESTART || page->type != WT_PAGE_COL_VAR)
		return (ret);

	/*
	 * Restart for a variable-length column-store. The restart could be
	 * caught higher up the call stack, but nothing would be gained:
	 * unlike row-store, where a normal search path finds cached overflow
	 * values, the page's reconciliation structures have to be accessed,
	 * and that is as easy here as anywhere above.
	 */
	return (__wt_ovfl_cache_col_restart(
	    session, page, &unpack, &cursor->value));
}