/*
 * __wt_row_leaf_obsolete --
 *	Discard all obsolete updates on a row-store leaf page.
 *
 *	Every update chain reachable from the page is handed to
 *	__wt_update_obsolete_check; whenever that call returns a non-NULL
 *	update, it is passed on to __wt_update_obsolete_free.
 */
void
__wt_row_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT *entry;
	WT_ROW *row;
	WT_UPDATE *stale;
	uint32_t n;

	/* Insert list for entries before the first on-page record. */
	WT_SKIP_FOREACH(entry, WT_ROW_INSERT_SMALLEST(page)) {
		stale = __wt_update_obsolete_check(session, entry->upd);
		if (stale != NULL)
			__wt_update_obsolete_free(session, page, stale);
	}

	/* Then every on-page entry, followed by that entry's insert list. */
	WT_ROW_FOREACH(page, row, n) {
		stale = __wt_update_obsolete_check(
		    session, WT_ROW_UPDATE(page, row));
		if (stale != NULL)
			__wt_update_obsolete_free(session, page, stale);

		WT_SKIP_FOREACH(entry, WT_ROW_INSERT(page, row)) {
			stale = __wt_update_obsolete_check(
			    session, entry->upd);
			if (stale != NULL)
				__wt_update_obsolete_free(
				    session, page, stale);
		}
	}
}
/*
 * __txn_abort_newer_row_leaf --
 *	Abort updates on a row leaf page with timestamps newer than the
 *	rollback timestamp.
 *
 *	The work itself is delegated: insert lists go to
 *	__txn_abort_newer_row_skip and on-page update chains go to
 *	__txn_abort_newer_update.
 */
static void
__txn_abort_newer_row_leaf(
    WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
{
	WT_INSERT_HEAD *skiplist;
	WT_ROW *row;
	WT_UPDATE *chain;
	uint32_t n;

	/* Insert list for keys before the first entry on the disk page. */
	skiplist = WT_ROW_INSERT_SMALLEST(page);
	if (skiplist != NULL)
		__txn_abort_newer_row_skip(
		    session, skiplist, rollback_timestamp);

	/*
	 * For each key on the disk image: first its own update chain, then
	 * the insert list of keys added after it since the page was read.
	 */
	WT_ROW_FOREACH(page, row, n) {
		chain = WT_ROW_UPDATE(page, row);
		if (chain != NULL)
			__txn_abort_newer_update(
			    session, chain, rollback_timestamp);

		skiplist = WT_ROW_INSERT(page, row);
		if (skiplist != NULL)
			__txn_abort_newer_row_skip(
			    session, skiplist, rollback_timestamp);
	}
}
/*
 * __ovfl_cache_row_visible --
 *	row-store: check for a globally visible update.
 *
 *	Returns true as soon as __wt_txn_visible_all accepts the transaction
 *	ID of any update in the row's chain, false if the chain is empty or
 *	no update qualifies.
 */
static bool
__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
{
	WT_UPDATE *cur;

	/* Walk the row's update chain from its head. */
	cur = WT_ROW_UPDATE(page, rip);
	while (cur != NULL) {
		if (__wt_txn_visible_all(session, cur->txnid))
			return (true);
		cur = cur->next;
	}

	return (false);
}
/*
 * __cursor_row_prev --
 *	Move to the previous row-store item.
 *
 *	cbt: the btree cursor. Its ins_head, ins, slot, row_iteration_slot
 *	    and page_deleted_count fields are updated as the walk proceeds,
 *	    and its interface key/value are filled in on success.
 *	newpage: true when the walk starts on a page; the cursor is then
 *	    positioned on the last insert list of the page before stepping.
 *
 *	Returns 0 after pointing the cursor's key/value at an insert-list
 *	entry, the result of __cursor_row_slot_return for an on-page row,
 *	WT_NOTFOUND when the beginning of the page is reached, or whatever
 *	WT_RET hands back from __wt_row_leaf_keys / __cursor_skip_prev
 *	(presumably any non-zero error -- confirm against the macro).
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;
	val = &cbt->iface.value;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * New page configuration.
	 */
	if (newpage) {
		/*
		 * If we haven't instantiated keys on this page, do so, else it
		 * is a very, very slow traversal.
		 */
		if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
			WT_RET(__wt_row_leaf_keys(session, page));

		/*
		 * Start on the final insert list: the one following the last
		 * on-page row, or the smallest-key list if the page has no
		 * rows.  In both cases the iteration slot computed below
		 * (entries * 2 + 1) is the odd slot naming that list.
		 */
		if (page->pg_row_entries == 0)
			cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		else
			cbt->ins_head =
			    WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;

		/* Enter the loop past the "step backward" call. */
		goto new_insert;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list.  Maintain the reference
		 * to the current insert element in case we switch to a cursor
		 * next movement.
		 */
		if (cbt->ins != NULL)
			WT_RET(__cursor_skip_prev(cbt));

new_insert:	if ((ins = cbt->ins) != NULL) {
			/*
			 * Nothing returned by __wt_txn_read (presumably no
			 * update visible to this transaction -- confirm):
			 * try the preceding insert entry.
			 */
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;

			/*
			 * Skip deleted entries; they are counted only when
			 * __wt_txn_visible_all accepts the update's
			 * transaction ID.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}

			/*
			 * Point the cursor's key at the insert entry's key and
			 * its value at the update's data.
			 */
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/* Check for the beginning of the page. */
		if (cbt->row_iteration_slot == 1)
			return (WT_NOTFOUND);
		--cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			/*
			 * Slot 1 is the smallest-key list; any other odd slot
			 * s maps to insert-list index s / 2 - 1.  Begin at the
			 * list's last element because the walk is backward.
			 */
			cbt->ins_head = cbt->row_iteration_slot == 1 ?
			    WT_ROW_INSERT_SMALLEST(page) :
			    WT_ROW_INSERT_SLOT(
			    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_LAST(cbt->ins_head);
			goto new_insert;
		}

		/* Even slot: an on-page row, so no insert list is current. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/* Even slot s maps to WT_ROW index s / 2 - 1. */
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row_d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));

		/* Same deleted-entry handling as for insert-list entries. */
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			if (__wt_txn_visible_all(session, upd->txnid))
				++cbt->page_deleted_count;
			continue;
		}

		/* upd may be NULL here; the helper receives it as-is. */
		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
/*
 * __cursor_row_next --
 *	Move to the next row-store item.
 *
 *	cbt: the btree cursor. Its ins_head, ins, slot, row_iteration_slot,
 *	    rip_saved and page_deleted_count fields are updated as the walk
 *	    proceeds, and its interface key is filled in for insert-list
 *	    entries.
 *	newpage: true when the walk starts on a page; the cursor is then
 *	    positioned on the smallest-key insert list before stepping.
 *
 *	Returns the result of __wt_value_return for an insert-list entry,
 *	the result of __cursor_row_slot_return for an on-page row, or
 *	WT_NOTFOUND when the end of the page is reached.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * Initialize for each new page.
	 */
	if (newpage) {
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		/*
		 * NOTE(review): rip_saved is reset for every new page; it is
		 * presumably a cached row pointer consumed by
		 * __cursor_row_slot_return -- confirm.
		 */
		cbt->rip_saved = NULL;

		/* Enter the loop past the "step forward" assignment. */
		goto new_insert;
	}

	/* Move to the next entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list; maintain the insert list
		 * head reference and entry count in case we switch to a cursor
		 * previous movement.
		 */
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:	if ((ins = cbt->ins) != NULL) {
			/*
			 * Nothing returned by __wt_txn_read (presumably no
			 * update visible to this transaction -- confirm):
			 * try the following insert entry.
			 */
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;

			/*
			 * Skip deleted entries; they are counted only when
			 * __wt_txn_upd_visible_all accepts the update.
			 */
			if (upd->type == WT_UPDATE_DELETED) {
				if (__wt_txn_upd_visible_all(session, upd))
					++cbt->page_deleted_count;
				continue;
			}

			/*
			 * Point the cursor's key at the insert entry's key;
			 * the value is produced by __wt_value_return.
			 */
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			return (__wt_value_return(session, cbt, upd));
		}

		/*
		 * Check for the end of the page.  The last slot in the name
		 * space is entries * 2 + 1, the insert list following the
		 * final on-page row.
		 */
		if (cbt->row_iteration_slot >= page->entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			/* Odd slot s maps to insert-list index s / 2 - 1. */
			cbt->ins_head = WT_ROW_INSERT_SLOT(
			    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}

		/* Even slot: an on-page row, so no insert list is current. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/* Even slot s maps to WT_ROW index s / 2 - 1. */
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));

		/* Same deleted-entry handling as for insert-list entries. */
		if (upd != NULL && upd->type == WT_UPDATE_DELETED) {
			if (__wt_txn_upd_visible_all(session, upd))
				++cbt->page_deleted_count;
			continue;
		}

		/* upd may be NULL here; the helper receives it as-is. */
		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
/*
 * __cursor_row_next --
 *	Move to the next row-store item.
 *
 *	cbt: the btree cursor. Its ins_head, ins, slot, row_iteration_slot
 *	    and page_deleted_count fields are updated as the walk proceeds,
 *	    and its interface key/value are filled in on success.
 *	newpage: non-zero when the walk starts on a page; the cursor is then
 *	    positioned on the smallest-key insert list before stepping.
 *
 *	Returns 0 after pointing the cursor's key/value at an insert-list
 *	entry, the result of __cursor_row_slot_return for an on-page row, or
 *	WT_NOTFOUND when the end of the page is reached.
 *
 *	Insert lists and on-page rows share one slot name space: odd slots
 *	are insert lists (slot 1 being the smallest-key list), even slots
 *	are WT_ROW entries.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;
	val = &cbt->iface.value;

	/*
	 * On a new page, position at the head of the smallest-key insert
	 * list and enter the loop past the "step forward" assignment.
	 */
	if (newpage){
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		goto new_insert;
	}

	for (;;){
		/* Continue traversing the current insert list, if any. */
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:	if ((ins = cbt->ins) != NULL) {
			/*
			 * Read the update through the transaction layer;
			 * when nothing is returned (presumably no update is
			 * visible to this transaction -- confirm), try the
			 * following insert entry.
			 */
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;

			/*
			 * Check for a delete: deleted items are counted and
			 * skipped.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			/*
			 * Point the cursor's key at the insert entry's key and
			 * its value at the update's data.
			 */
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return 0;
		}

		/*
		 * Walking the page's row entries: check for the end of the
		 * page.  The last slot is entries * 2 + 1, the insert list
		 * following the final on-page row.
		 */
		if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}

		/* Even slot: an on-page row, so no insert list is current. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/* Compute the WT_ROW index: even slot s maps to s / 2 - 1. */
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row_d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));

		/* Deleted on-page rows are counted and skipped as well. */
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}

		/* upd may be NULL here; the helper receives it as-is. */
		return __cursor_row_slot_return(cbt, rip, upd);
	}
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 *
 *	session: the session; its btree handle is consulted.
 *	cbt: the btree cursor, already positioned (page, slot, ins, recno).
 *	key_ret: when non-zero, also set the cursor's key (row-store) or
 *	    record number (column-store); the value is handled regardless.
 *
 *	Returns 0 on success, otherwise whatever the helper that failed
 *	returned.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpacked;
	WT_CURSOR *cursor;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	cursor = &cbt->iface;
	page = cbt->page;
	btree = session->btree;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* A referenced WT_INSERT item supplies the value directly. */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise copy the single on-page bit-field byte. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));

	case WT_PAGE_COL_VAR:
		if (key_ret)
			cursor->recno = cbt->recno;

		/* A referenced WT_INSERT item supplies the value directly. */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Fall out of the switch with the on-page cell. */
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;

	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * With a WT_INSERT item the key and the update both come from
		 * that item.  Without one, the key comes from the original
		 * page and the update from the row's own update slot.
		 */
		if (cbt->ins != NULL) {
			if (key_ret) {
				cursor->key.data = WT_INSERT_KEY(cbt->ins);
				cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
			}
			upd = cbt->ins->upd;
		} else {
			if (key_ret) {
				if (!__wt_off_page(page, rip->key)) {
					WT_RET(__wt_row_key(
					    session, page, rip, &cursor->key));
				} else {
					ikey = rip->key;
					cursor->key.data = WT_IKEY_DATA(ikey);
					cursor->key.size = ikey->size;
				}
			}
			upd = WT_ROW_UPDATE(page, rip);
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the original cell (which may be empty). */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * It's a cell: unpack it.  Anything other than a plain value cell
	 * with no Huffman encoding configured has to go through the copy
	 * routine; the simple case can reference the unpacked bytes directly.
	 */
	__wt_cell_unpack(cell, &unpacked);
	if (btree->huffman_value != NULL || unpacked.type != WT_CELL_VALUE)
		return (
		    __wt_cell_unpack_copy(session, &unpacked, &cursor->value));

	cursor->value.data = unpacked.data;
	cursor->value.size = unpacked.size;
	return (0);
}
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 *
 *	session: the session; its btree handle is obtained via S2BT.
 *	cbt: the btree cursor, already positioned (page, slot, ins, recno).
 *
 *	Returns 0 on success, otherwise whatever the helper that failed
 *	returned.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	cursor = &cbt->iface;
	page = cbt->page;
	btree = S2BT(session);

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/*
		 * When a WT_INSERT item is referenced and the transactional
		 * read of its update chain yields an update, that update is
		 * the value.
		 */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Otherwise copy the single on-page bit-field byte. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));

	case WT_PAGE_COL_VAR:
		/* As for fixed-length pages: always set the record number. */
		cursor->recno = cbt->recno;

		/* Prefer an update read from a referenced WT_INSERT item. */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Fall out of the switch with the on-page cell. */
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;

	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * If a WT_INSERT item is referenced and reading it yields an
		 * update, take the key from that item.  In every other case
		 * take the key from the original page and read the row's own
		 * update slot instead.
		 */
		upd = NULL;
		if (cbt->ins != NULL)
			upd = __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else {
			WT_RET(
			    __wt_row_key(session, page, rip, &cursor->key, 0));
			upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the original cell (which may be empty). */
		cell = __wt_row_value(page, rip);
		if (cell == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value);

	/* Everything except a column-store restart is returned unchanged. */
	if (ret != WT_RESTART || page->type != WT_PAGE_COL_VAR)
		return (ret);

	/*
	 * Restart for a variable-length column-store.  We could catch restart
	 * higher up the call-stack but there's no point to it: unlike row-store
	 * (where a normal search path finds cached overflow values), we have to
	 * access the page's reconciliation structures, and that's as easy here
	 * as higher up the stack.
	 */
	return (__wt_ovfl_cache_col_restart(
	    session, page, &unpack, &cursor->value));
}