/*
 * __txn_abort_newer_row_leaf --
 *	Walk a row-store leaf page and abort the updates whose timestamps are
 *	newer than the rollback timestamp.
 */
static void
__txn_abort_newer_row_leaf(
    WT_SESSION_IMPL *session, WT_PAGE *page, wt_timestamp_t *rollback_timestamp)
{
	WT_INSERT_HEAD *ins_head;
	WT_ROW *row;
	WT_UPDATE *first_upd;
	uint32_t cnt;

	/*
	 * Keys that sort before the first entry of the disk image are kept on
	 * a dedicated insert list: process that list first.
	 */
	ins_head = WT_ROW_INSERT_SMALLEST(page);
	if (ins_head != NULL)
		__txn_abort_newer_row_skip(
		    session, ins_head, rollback_timestamp);

	/*
	 * Then visit every key of the disk image: first the updates attached
	 * to the key itself, then the list of keys inserted after it since the
	 * page was read from disk.
	 */
	WT_ROW_FOREACH(page, row, cnt) {
		first_upd = WT_ROW_UPDATE(page, row);
		if (first_upd != NULL)
			__txn_abort_newer_update(
			    session, first_upd, rollback_timestamp);

		ins_head = WT_ROW_INSERT(page, row);
		if (ins_head != NULL)
			__txn_abort_newer_row_skip(
			    session, ins_head, rollback_timestamp);
	}
}
/*
 * __wt_row_leaf_obsolete --
 *	Discard all obsolete updates on a row-store leaf page.
 */
void
__wt_row_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_INSERT *item;
	WT_ROW *row;
	WT_UPDATE *obsolete;
	uint32_t cnt;

	/* Insert list holding the entries before the first on-page record. */
	WT_SKIP_FOREACH(item, WT_ROW_INSERT_SMALLEST(page)) {
		obsolete = __wt_update_obsolete_check(session, item->upd);
		if (obsolete != NULL)
			__wt_update_obsolete_free(session, page, obsolete);
	}

	/*
	 * Every on-page entry: check the entry's own update list, then each
	 * item on the insert list attached to that entry.
	 */
	WT_ROW_FOREACH(page, row, cnt) {
		obsolete = __wt_update_obsolete_check(
		    session, WT_ROW_UPDATE(page, row));
		if (obsolete != NULL)
			__wt_update_obsolete_free(session, page, obsolete);

		WT_SKIP_FOREACH(item, WT_ROW_INSERT(page, row)) {
			obsolete =
			    __wt_update_obsolete_check(session, item->upd);
			if (obsolete != NULL)
				__wt_update_obsolete_free(
				    session, page, obsolete);
		}
	}
}
/*
 * __wt_btcur_iterate_setup --
 *	Prepare a btree cursor for iteration, usually right after a search has
 *	positioned it.
 */
void
__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
{
	WT_PAGE *page;

	WT_UNUSED(next);

	/*
	 * Switching between next and prev calls requires no setup today, so
	 * the direction argument is ignored and both iteration flags are set.
	 * The argument is kept in case direction-specific setup is ever needed.
	 */
	F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);

	/* Restart the count of deleted items seen on the page. */
	cbt->page_deleted_count = 0;

	/*
	 * Without a search page the iteration starts at the beginning or end
	 * of the tree rather than from a search result: nothing more to do.
	 */
	if (cbt->ref == NULL)
		return;
	page = cbt->ref->page;

	if (page->type != WT_PAGE_ROW_LEAF) {
		/* Column-store: record the largest record on the page. */
		if (page->type == WT_PAGE_COL_VAR)
			cbt->last_standard_recno = __col_var_last_recno(page);
		else
			cbt->last_standard_recno = __col_fix_last_recno(page);

		/* Note when the traversal is over the append list. */
		if (cbt->ins_head != NULL &&
		    cbt->ins_head == WT_COL_APPEND(page))
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
		return;
	}

	/*
	 * Row-store: a single number identifies the part of the page being
	 * walked, which keeps switching between next and prev manageable. The
	 * WT_ROW array and the WT_INSERT_HEAD array share one name space:
	 * slot 1 is the "smallest key" insert list, slot 2 is WT_ROW[0],
	 * slot 3 is WT_INSERT_HEAD[0], and so on. Insert lists therefore get
	 * odd slots and WT_ROW entries get even slots.
	 */
	cbt->row_iteration_slot = (cbt->slot + 1) * 2;
	if (cbt->ins_head != NULL) {
		if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
			cbt->row_iteration_slot = 1;
		else
			++cbt->row_iteration_slot;
	}
}
/*
 * __wt_btcur_iterate_setup --
 *	Initialize a cursor for iteration following a search operation.
 */
void
__wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt, int next)
{
	WT_PAGE *page;

	WT_UNUSED(next);

	/*
	 * No setup is currently required when switching between next and prev
	 * calls, so both iteration flags are set whatever the direction; the
	 * argument remains in case that changes.
	 */
	F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);

	/* Reset the counter of deleted items tallied during the traversal. */
	cbt->page_deleted_count = 0;

	if (cbt->ref == NULL)
		return;
	page = cbt->ref->page;

	/* Row-store leaf page: choose the iteration slot. */
	if (page->type == WT_PAGE_ROW_LEAF) {
		/*
		 * One value describes which part of the page is being walked
		 * (anything else makes switching between next and prev too
		 * complicated). The WT_ROW and WT_INSERT_HEAD array slots are
		 * mapped into a single name space: slot 1 is the "smallest key
		 * insert list", slot 2 is WT_ROW[0], slot 3 is
		 * WT_INSERT_HEAD[0], and so on. WT_INSERT lists are the
		 * odd-numbered slots, WT_ROW entries the even-numbered ones.
		 */
		cbt->row_iteration_slot = (cbt->slot + 1) * 2;
		if (cbt->ins_head == NULL)
			return;

		if (cbt->ins_head == WT_ROW_INSERT_SMALLEST(page))
			cbt->row_iteration_slot = 1;
		else
			cbt->row_iteration_slot += 1;
		return;
	}

	/* Column-store page: compute the largest record number on the page. */
	if (page->type == WT_PAGE_COL_VAR)
		cbt->last_standard_recno = __col_var_last_recno(page);
	else
		cbt->last_standard_recno = __col_fix_last_recno(page);

	/* Flag the cursor when its insert list is the page's append list. */
	if (cbt->ins_head != NULL && cbt->ins_head == WT_COL_APPEND(page))
		F_SET(cbt, WT_CBT_ITERATE_APPEND);
}
/*
 * __cursor_row_prev --
 *	Step the cursor back to the previous row-store item.
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *cur;
	WT_ITEM *kbuf, *vbuf;
	WT_PAGE *page;
	WT_ROW *row;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	bool step_back;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	kbuf = &cbt->iface.key;
	vbuf = &cbt->iface.value;

	/*
	 * Row-store pages are walked through a single slot number so that
	 * switching between next and prev stays simple. The WT_ROW array and
	 * the WT_INSERT_HEAD array are folded into one name space: slot 1 is
	 * the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on. Odd slots are WT_INSERT lists, even
	 * slots are WT_ROW entries.
	 *
	 * A freshly positioned insert list must be examined before moving
	 * within it, so the "step back" at the top of the loop is skipped for
	 * the first pass on a new page and after each switch to a new list.
	 */
	step_back = !newpage;

	if (newpage) {
		/*
		 * Instantiate the page's keys if that has not been done yet,
		 * otherwise the traversal is very, very slow.
		 */
		if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
			WT_RET(__wt_row_leaf_keys(session, page));

		/* Start from the tail of the page's last insert list. */
		cbt->ins_head = page->pg_row_entries == 0 ?
		    WT_ROW_INSERT_SMALLEST(page) :
		    WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		/*
		 * Keep walking the current insert list, if any. The current
		 * insert element stays referenced from the cursor in case the
		 * caller switches to a cursor next movement.
		 */
		if (step_back && cbt->ins != NULL)
			WT_RET(__cursor_skip_prev(cbt));
		step_back = true;

		cur = cbt->ins;
		if (cur != NULL) {
			/* Nothing to return for this element: keep going. */
			upd = __wt_txn_read(session, cur->upd);
			if (upd == NULL)
				continue;

			/*
			 * Skip deleted elements, counting those for which
			 * __wt_txn_visible_all accepts the transaction ID.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}

			kbuf->data = WT_INSERT_KEY(cur);
			kbuf->size = WT_INSERT_KEY_SIZE(cur);
			vbuf->data = WT_UPDATE_DATA(upd);
			vbuf->size = upd->size;
			return (0);
		}

		/* Slot 1 is the beginning of the page. */
		if (cbt->row_iteration_slot == 1)
			return (WT_NOTFOUND);
		--cbt->row_iteration_slot;

		/*
		 * An odd slot is a WT_INSERT_HEAD entry: position on the last
		 * element of that list and examine it without stepping.
		 */
		if ((cbt->row_iteration_slot & 0x01) != 0) {
			if (cbt->row_iteration_slot == 1)
				cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
			else
				cbt->ins_head = WT_ROW_INSERT_SLOT(
				    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_LAST(cbt->ins_head);
			step_back = false;
			continue;
		}

		/* An even slot is a WT_ROW entry. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		row = &page->pg_row_d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, row));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			if (__wt_txn_visible_all(session, upd->txnid))
				++cbt->page_deleted_count;
			continue;
		}

		return (__cursor_row_slot_return(cbt, row, upd));
	}
	/* NOTREACHED */
}
/*
 * __cursor_row_next --
 *	Advance the cursor to the next row-store item.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *cur;
	WT_ITEM *kbuf;
	WT_PAGE *page;
	WT_ROW *row;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	bool advance;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	kbuf = &cbt->iface.key;

	/*
	 * Row-store pages are walked through a single slot number so that
	 * switching between next and prev stays simple. The WT_ROW array and
	 * the WT_INSERT_HEAD array are folded into one name space: slot 1 is
	 * the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on. Odd slots are WT_INSERT lists, even
	 * slots are WT_ROW entries.
	 *
	 * A freshly positioned insert list must be examined before moving
	 * within it, so the advance at the top of the loop is skipped for the
	 * first pass on a new page and after each switch to a new list.
	 */
	advance = !newpage;

	/* Initialize for each new page: begin at the smallest-key list. */
	if (newpage) {
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		cbt->rip_saved = NULL;
	}

	/* Move to the next entry and return the item. */
	for (;;) {
		/*
		 * Keep walking the current insert list, if any. The insert
		 * list head and current element stay referenced from the
		 * cursor in case the caller switches to a cursor previous
		 * movement.
		 */
		if (advance && cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);
		advance = true;

		cur = cbt->ins;
		if (cur != NULL) {
			/* Nothing to return for this element: keep going. */
			upd = __wt_txn_read(session, cur->upd);
			if (upd == NULL)
				continue;

			/*
			 * Skip deleted elements, counting those accepted by
			 * __wt_txn_upd_visible_all.
			 */
			if (upd->type == WT_UPDATE_DELETED) {
				if (__wt_txn_upd_visible_all(session, upd))
					++cbt->page_deleted_count;
				continue;
			}

			kbuf->data = WT_INSERT_KEY(cur);
			kbuf->size = WT_INSERT_KEY_SIZE(cur);
			return (__wt_value_return(session, cbt, upd));
		}

		/* The last slot on the page is entries * 2 + 1. */
		if (cbt->row_iteration_slot >= page->entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * An odd slot is a WT_INSERT_HEAD entry: position on the first
		 * element of that list and examine it without advancing.
		 */
		if ((cbt->row_iteration_slot & 0x01) != 0) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(
			    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			advance = false;
			continue;
		}

		/* An even slot is a WT_ROW entry. */
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		row = &page->pg_row[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, row));
		if (upd != NULL && upd->type == WT_UPDATE_DELETED) {
			if (__wt_txn_upd_visible_all(session, upd))
				++cbt->page_deleted_count;
			continue;
		}

		return (__cursor_row_slot_return(cbt, row, upd));
	}
	/* NOTREACHED */
}
/*
 * __cursor_row_next --
 *	Move to the next row object of a row-store page.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_INSERT *cur;
	WT_ITEM *kbuf, *vbuf;
	WT_PAGE *page;
	WT_ROW *row;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int advance;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	kbuf = &cbt->iface.key;
	vbuf = &cbt->iface.value;

	/*
	 * A freshly positioned insert list must be examined before moving
	 * within it, so the advance at the top of the loop is skipped for the
	 * first pass on a new page and after each switch to a new list.
	 */
	advance = !newpage;

	/* On a new page, position at the head of the smallest-key insert list. */
	if (newpage) {
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
	}

	for (;;) {
		/* Continue along the current insert list, if there is one. */
		if (advance && cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);
		advance = 1;

		cur = cbt->ins;
		if (cur != NULL) {
			/* Read the data visible to the transaction. */
			upd = __wt_txn_read(session, cur->upd);
			if (upd == NULL)
				continue;

			/* Deleted objects are counted and skipped. */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			kbuf->data = WT_INSERT_KEY(cur);
			kbuf->size = WT_INSERT_KEY_SIZE(cur);
			vbuf->data = WT_UPDATE_DATA(upd);
			vbuf->size = upd->size;
			return (0);
		}

		/* Past the last slot of the row entries array: end of page. */
		if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots are WT_INSERT_HEAD entries, even-numbered
		 * slots are WT_ROW entries. For an insert list, position on
		 * its first element and examine it without advancing.
		 */
		if ((cbt->row_iteration_slot & 0x01) != 0) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(
			    page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			advance = 0;
			continue;
		}

		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/* Compute the WT_ROW slot and read its update. */
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		row = &page->pg_row_d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, row));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}

		return (__cursor_row_slot_return(cbt, row, upd));
	}
}