/*
 * __cursor_var_append_prev --
 *     Return the previous variable-length entry on the append list.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the list is
 *     exhausted, or any error returned by __cursor_skip_prev.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    /* A new page starts at the list's last entry, otherwise step back one. */
    if (newpage) {
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
    } else {
        WT_RET(__cursor_skip_prev(cbt));
    }

    while (cbt->ins != NULL) {
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

        /* A visible update that is not a delete is the entry to return. */
        upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL && !WT_UPDATE_DELETED_ISSET(upd)) {
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /* Only count a delete once it is visible to all transactions. */
        if (upd != NULL && __wt_txn_visible_all(session, upd->txnid))
            ++cbt->page_deleted_count;

        WT_RET(__cursor_skip_prev(cbt));
    }
    return (WT_NOTFOUND);
}
/*
 * __cursor_var_append_prev --
 *     Return the previous variable-length entry on the append list.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the list is
 *     exhausted, or any error returned by __cursor_skip_prev.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    /* A new page starts at the list's last entry, otherwise step back one. */
    if (newpage) {
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
    } else {
        WT_RET(__cursor_skip_prev(cbt));
    }

    while (cbt->ins != NULL) {
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

        /* Skip entries with no visible update and entries that are deleted. */
        upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL && !WT_UPDATE_DELETED_ISSET(upd)) {
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        WT_RET(__cursor_skip_prev(cbt));
    }
    return (WT_NOTFOUND);
}
/*
 * __cursor_invalid --
 *     Return if the cursor references an invalid K/V pair (either the pair
 *     doesn't exist at all because the tree is empty, or the pair was deleted).
 *
 *     This is a predicate: it returns 1 when the pair is invalid and 0 when it
 *     is valid. It never returns an error code.
 */
static inline int
__cursor_invalid(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_INSERT *ins;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    btree = cbt->btree;
    ins = cbt->ins;
    page = cbt->page;
    session = (WT_SESSION_IMPL *)cbt->iface.session;

    /* If we found an item on an insert list, its visible update decides. */
    if (ins != NULL && (upd = __wt_txn_read(session, ins->upd)) != NULL)
        return (WT_UPDATE_DELETED_ISSET(upd) ? 1 : 0);

    /* The page may be empty, the search routine doesn't check. */
    if (page->entries == 0)
        return (1);

    /* Otherwise, check for an update in the page's slots. */
    switch (btree->type) {
    case BTREE_COL_FIX:
        break;
    case BTREE_COL_VAR:
        cip = &page->u.col_var.d[cbt->slot];
        /*
         * A slot without a cell has no value to return. Report it the same
         * way as every other "invalid" case (1) rather than leaking an error
         * code out of a boolean predicate.
         */
        if ((cell = WT_COL_PTR(page, cip)) == NULL)
            return (1);
        __wt_cell_unpack(cell, &unpack);
        if (unpack.type == WT_CELL_DEL)
            return (1);
        break;
    case BTREE_ROW:
        /* A visible, deleted update on the slot makes the pair invalid. */
        if (page->u.row.upd != NULL &&
          (upd = __wt_txn_read(session, page->u.row.upd[cbt->slot])) != NULL &&
          WT_UPDATE_DELETED_ISSET(upd))
            return (1);
        break;
    }
    return (0);
}
/*
 * __txn_op_log --
 *     Log an operation for the current transaction.
 *
 *     The key (row-store) or record number (column-store) is taken from the
 *     cursor; the value comes from the operation's update. Any key buffer is
 *     released before returning.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
{
    WT_DECL_RET;
    WT_ITEM key, value;
    WT_UPDATE *upd;
    uint64_t recno;

    WT_CLEAR(key);
    upd = op->u.upd;
    value.data = WT_UPDATE_DATA(upd);
    value.size = upd->size;

    /*
     * Four cases, selected by tree type and whether the update is a delete:
     * column-store remove, column-store insert/update, row-store remove and
     * row-store insert/update.
     */
    if (cbt->btree->type != BTREE_ROW) {
        recno = WT_INSERT_RECNO(cbt->ins);
        WT_ASSERT(session, recno != WT_RECNO_OOB);

        if (WT_UPDATE_DELETED_ISSET(upd)) {
            WT_ERR(__wt_logop_col_remove_pack(session, logrec, op->fileid, recno));
        } else {
            WT_ERR(__wt_logop_col_put_pack(session, logrec, op->fileid, recno, &value));
        }
    } else {
        WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));

        if (WT_UPDATE_DELETED_ISSET(upd)) {
            WT_ERR(__wt_logop_row_remove_pack(session, logrec, op->fileid, &key));
        } else {
            WT_ERR(__wt_logop_row_put_pack(session, logrec, op->fileid, &key, &value));
        }
    }

err:
    __wt_buf_free(session, &key);
    return (ret);
}
/* * __stat_page_col_var -- * Stat a WT_PAGE_COL_VAR page. */ static int __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; uint32_t i; int orig_deleted; unpack = &_unpack; WT_STAT_INCR(stats, btree_column_variable); /* * Walk the page, counting regular and overflow data items, and checking * to be sure any updates weren't deletions. If the item was updated, * assume it was updated by an item of the same size (it's expensive to * figure out if it will require the same space or not, especially if * there's Huffman encoding). */ WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = 1; WT_STAT_INCR(stats, btree_column_deleted); } else { orig_deleted = 0; __wt_cell_unpack(cell, unpack); WT_STAT_INCRV( stats, btree_entries, __wt_cell_rle(unpack)); } /* * Walk the insert list, checking for changes. For each insert * we find, correct the original count based on its state. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; if (WT_UPDATE_DELETED_ISSET(upd)) { if (orig_deleted) continue; WT_STAT_INCR(stats, btree_column_deleted); WT_STAT_DECR(stats, btree_entries); } else { if (!orig_deleted) continue; WT_STAT_DECR(stats, btree_column_deleted); WT_STAT_INCR(stats, btree_entries); } } }
/*
 * __txn_op_log --
 *     Log an operation for the current transaction.
 *
 *     An operation with a key is logged as a row-store operation; one without
 *     a key is logged as a column-store operation using the record number of
 *     its insert entry. A deleted update is logged as a remove, anything else
 *     as a put.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op)
{
    WT_ITEM value;
    uint64_t recno;

    value.data = WT_UPDATE_DATA(op->u.op.upd);
    value.size = op->u.op.upd->size;

    /* Row-store: the operation carries its key. */
    if (op->u.op.key.data != NULL) {
        if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) {
            WT_RET(__wt_logop_row_remove_pack(session, logrec, op->fileid, &op->u.op.key));
        } else {
            WT_RET(
              __wt_logop_row_put_pack(session, logrec, op->fileid, &op->u.op.key, &value));
        }
        return (0);
    }

    /* Column-store: the record number lives in the insert entry. */
    WT_ASSERT(session, op->u.op.ins != NULL);
    recno = op->u.op.ins->u.recno;

    if (WT_UPDATE_DELETED_ISSET(op->u.op.upd)) {
        WT_RET(__wt_logop_col_remove_pack(session, logrec, op->fileid, recno));
    } else {
        WT_RET(__wt_logop_col_put_pack(session, logrec, op->fileid, recno, &value));
    }
    return (0);
}
/*
 * __wt_update_obsolete_free --
 *     Free an obsolete update list.
 *
 *     Every WT_UPDATE reachable from upd is freed, and the page's in-memory
 *     footprint is decremented by the total number of bytes released.
 */
void
__wt_update_obsolete_free(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
{
    WT_UPDATE *next;
    size_t size;

    size = 0;
    while (upd != NULL) {
        /* Remember the successor before the current entry is released. */
        next = upd->next;

        /* Deleted items have a dummy size: don't include that. */
        size += sizeof(WT_UPDATE);
        if (!WT_UPDATE_DELETED_ISSET(upd))
            size += upd->size;

        __wt_free(session, upd);
        upd = next;
    }

    if (size != 0)
        __wt_cache_page_inmem_decr(session, page, size);
}
/*
 * __cursor_var_append_next --
 *     Return the next variable-length entry on the append list.
 *
 *     Returns 0 with the cursor's value set, or WT_NOTFOUND when the list is
 *     exhausted.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    /* A new page starts at the list's first entry, otherwise step forward. */
    if (newpage)
        cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
    else
        cbt->ins = WT_SKIP_NEXT(cbt->ins);

    for (; cbt->ins != NULL; cbt->ins = WT_SKIP_NEXT(cbt->ins)) {
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

        /* Nothing visible to this transaction: keep moving forward. */
        upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd == NULL)
            continue;

        /* Deleted entries are counted and skipped. */
        if (WT_UPDATE_DELETED_ISSET(upd)) {
            ++cbt->page_deleted_count;
            continue;
        }

        /* Hand the visible value back through the cursor. */
        val->data = WT_UPDATE_DATA(upd);
        val->size = upd->size;
        return (0);
    }
    return (WT_NOTFOUND);
}
/*
 * __cursor_valid --
 *     Return if the cursor references a valid key/value pair.
 *
 *     When updp is non-NULL it is cleared on entry and, if a visible,
 *     non-deleted update supplies the value, set to reference that update.
 */
static inline bool
__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_COL *cip;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    btree = cbt->btree;
    page = cbt->ref->page;
    session = (WT_SESSION_IMPL *)cbt->iface.session;

    if (updp != NULL)
        *updp = NULL;

    /*
     * The cursor may reference an insert object, and the page may also hold
     * on-page entries. Insert objects always carry update objects (the
     * value), and any update may be deleted or invisible to us. An on-page
     * entry, by definition, has a value visible to us: the original cell.
     *
     * A visible update is handed back to the caller so it need not be
     * searched for again: it might suddenly become invisible (imagine a
     * read-uncommitted session with another session's aborted insert), and
     * we don't want to handle that potential error at every use.
     *
     * Row-store: an insert object never shares a key with an on-page object.
     *     if there's an insert object:
     *         if there's a visible update: exact match
     *         else: no exact match
     *     else:
     *         use the on-page object (which may have an associated update
     *         that may or may not be visible to us).
     *
     * Column-store: an insert object can share a key with an on-page object
     * (updates to column-store rows are insert/object pairs), so an
     * invisible update isn't the end, the on-page object may be visible.
     *     if there's an insert object:
     *         if there's a visible update: exact match
     *         else if the on-page object's key matches the insert key:
     *             use the on-page object
     *     else:
     *         use the on-page object
     *
     * First, check for an insert object with a visible update (a visible
     * update that's been deleted is not a valid key/value pair).
     */
    upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
    if (upd != NULL) {
        if (WT_UPDATE_DELETED_ISSET(upd))
            return (false);
        if (updp != NULL)
            *updp = upd;
        return (true);
    }

    /*
     * Either there is no insert object or, for column-store, there is one
     * with no visible update. Fall back to the original page information,
     * provided the slot set by the search function is usable.
     */
    switch (btree->type) {
    case BTREE_COL_FIX:
        /*
         * Fixed-length column-store pages don't have slots but map
         * one-to-one to keys: check for retrieval past the end of the page.
         * An update would have appeared as an "insert" object, so there is
         * nothing further to check.
         */
        if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries)
            return (false);
        break;
    case BTREE_COL_VAR:
        /* The search function doesn't check for empty pages. */
        if (page->pg_var_entries == 0)
            return (false);
        WT_ASSERT(session, cbt->slot < page->pg_var_entries);

        /*
         * Column-store updates are stored as "insert" objects. If search
         * returned an insert object we can't return, the on-page object is
         * only usable when it was flagged as matching.
         */
        if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
            return (false);

        /*
         * Variable-length column-store deletes are written into the backing
         * store: check the cell for a record already deleted when read.
         */
        cip = &page->pg_var_d[cbt->slot];
        cell = WT_COL_PTR(page, cip);
        if (cell == NULL || __wt_cell_type(cell) == WT_CELL_DEL)
            return (false);
        break;
    case BTREE_ROW:
        /* The search function doesn't check for empty pages. */
        if (page->pg_row_entries == 0)
            return (false);
        WT_ASSERT(session, cbt->slot < page->pg_row_entries);

        /*
         * For row-store, no insert object can have the same key as an
         * on-page object: an insert object here means no match.
         */
        if (cbt->ins != NULL)
            return (false);

        /* Check for an update on the slot. */
        if (page->modify == NULL || page->modify->mod_row_update == NULL)
            break;
        upd = __wt_txn_read(session, page->modify->mod_row_update[cbt->slot]);
        if (upd == NULL)
            break;
        if (WT_UPDATE_DELETED_ISSET(upd))
            return (false);
        if (updp != NULL)
            *updp = upd;
        break;
    }
    return (true);
}
/*
 * __cursor_var_prev --
 *     Move to the previous, variable-length column-store item.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the walk moves
 *     off the front of the page, or an error from referencing the cell data.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_INSERT *ins;
    WT_ITEM *val;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;
    uint64_t rle_start;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    val = &cbt->iface.value;

    rle_start = 0; /* -Werror=maybe-uninitialized */

    /* Start at the page's last record on a new page, else step back one. */
    if (newpage) {
        cbt->last_standard_recno = __col_var_last_recno(page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->last_standard_recno);
    } else
        __cursor_set_recno(cbt, cbt->recno - 1);

    /* Every "continue" below steps the cursor back by one record. */
    for (;; __cursor_set_recno(cbt, cbt->recno - 1)) {
        if (cbt->recno < page->pg_var_recno)
            return (WT_NOTFOUND);

        /* Find the matching WT_COL slot. */
        cip = __col_var_search(page, cbt->recno, &rle_start);
        if (cip == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        upd = NULL;
        if (cbt->ins != NULL)
            upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL) {
            if (!WT_UPDATE_DELETED_ISSET(upd)) {
                val->data = WT_UPDATE_DATA(upd);
                val->size = upd->size;
                return (0);
            }
            /* Only count a delete once it is visible to all transactions. */
            if (__wt_txn_visible_all(session, upd->txnid))
                ++cbt->page_deleted_count;
            continue;
        }

        /*
         * If we're at the same slot as the last reference and there's no
         * matching insert list item, re-use the return information (so
         * encoded items with large repeat counts aren't repeatedly decoded).
         * Otherwise, unpack the cell and build the return information.
         */
        if (cbt->cip_saved != cip) {
            cell = WT_COL_PTR(page, cip);
            if (cell == NULL)
                continue;
            __wt_cell_unpack(cell, &unpack);
            if (unpack.type == WT_CELL_DEL) {
                if (__wt_cell_rle(&unpack) == 1)
                    continue;

                /*
                 * There can be huge gaps in the variable-length column-store
                 * name space appearing as deleted records. With more than
                 * one deleted record, find the next record to return instead
                 * of looping through them one at a time.
                 *
                 * First, find the largest record in the update list that's
                 * smaller than the current record.
                 */
                ins = __col_insert_search_lt(cbt->ins_head, cbt->recno);

                /*
                 * Second, __col_var_search returned the starting record of
                 * this repeat group. The starting record - 1 is where we
                 * could skip to if the update list holds nothing larger.
                 */
                cbt->recno = rle_start - 1;
                if (ins != NULL && WT_INSERT_RECNO(ins) > cbt->recno)
                    cbt->recno = WT_INSERT_RECNO(ins);

                /* Adjust for the loop's decrement. */
                ++cbt->recno;
                continue;
            }
            WT_RET(__wt_page_cell_data_ref(session, page, &unpack, cbt->tmp));

            cbt->cip_saved = cip;
        }
        val->data = cbt->tmp->data;
        val->size = cbt->tmp->size;
        return (0);
    }
    /* NOTREACHED */
}
/*
 * __cursor_row_prev --
 *     Move to the previous row-store item.
 *
 *     Returns 0 with the cursor's key and value set, WT_NOTFOUND at the
 *     beginning of the page, or an error from a called helper.
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
    WT_INSERT *ins;
    WT_ITEM *key, *val;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    key = &cbt->iface.key;
    val = &cbt->iface.value;

    /*
     * For row-store pages, we need a single item that tells us the part of
     * the page we're walking (otherwise switching from next to prev and
     * vice-versa is just too complicated), so the WT_ROW and WT_INSERT_HEAD
     * insert array slots are mapped into a single name space: slot 1 is the
     * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
     * WT_INSERT_HEAD[0], and so on. WT_INSERT lists are therefore the
     * odd-numbered slots and WT_ROW array entries the even-numbered slots.
     *
     * New page configuration.
     */
    if (newpage) {
        /*
         * If we haven't instantiated keys on this page, do so, else it is a
         * very, very slow traversal.
         */
        if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) {
            WT_RET(__wt_row_leaf_keys(session, page));
        }

        /* Start from the insert list that follows the last on-page row. */
        cbt->ins_head = page->pg_row_entries == 0 ?
          WT_ROW_INSERT_SMALLEST(page) :
          WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
        cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
        goto new_insert;
    }

    /* Move to the previous entry and return the item. */
    for (;;) {
        /*
         * Continue traversing any insert list. Maintain the reference to
         * the current insert element in case we switch to a cursor next
         * movement.
         */
        if (cbt->ins != NULL) {
            WT_RET(__cursor_skip_prev(cbt));
        }

new_insert:
        ins = cbt->ins;
        if (ins != NULL) {
            upd = __wt_txn_read(session, ins->upd);
            if (upd == NULL)
                continue;
            if (!WT_UPDATE_DELETED_ISSET(upd)) {
                key->data = WT_INSERT_KEY(ins);
                key->size = WT_INSERT_KEY_SIZE(ins);
                val->data = WT_UPDATE_DATA(upd);
                val->size = upd->size;
                return (0);
            }
            /* Only count a delete once it is visible to all transactions. */
            if (__wt_txn_visible_all(session, upd->txnid))
                ++cbt->page_deleted_count;
            continue;
        }

        /* Check for the beginning of the page. */
        if (cbt->row_iteration_slot == 1)
            return (WT_NOTFOUND);
        --cbt->row_iteration_slot;

        /*
         * Odd-numbered slots configure as WT_INSERT_HEAD entries,
         * even-numbered slots configure as WT_ROW entries.
         */
        if (cbt->row_iteration_slot & 0x01) {
            if (cbt->row_iteration_slot == 1)
                cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
            else
                cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
            cbt->ins = WT_SKIP_LAST(cbt->ins_head);
            goto new_insert;
        }

        /* An on-page row: no insert list is being walked. */
        cbt->ins_head = NULL;
        cbt->ins = NULL;

        cbt->slot = cbt->row_iteration_slot / 2 - 1;
        rip = &page->pg_row_d[cbt->slot];
        upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
        if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
            return (__cursor_row_slot_return(cbt, rip, upd));

        /* The row is deleted: count it if globally visible and move on. */
        if (__wt_txn_visible_all(session, upd->txnid))
            ++cbt->page_deleted_count;
    }
    /* NOTREACHED */
}
/*
 * __cursor_var_prev --
 *     Move to the previous, variable-length column-store item.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the walk moves
 *     off the front of the page, or an error from referencing the cell data.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_ITEM *val;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    val = &cbt->iface.value;

    /* Start at the page's last record on a new page, else step back one. */
    if (newpage) {
        cbt->last_standard_recno = __col_var_last_recno(page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->last_standard_recno);
    } else
        __cursor_set_recno(cbt, cbt->recno - 1);

    /* Every "continue" below steps the cursor back by one record. */
    for (;; __cursor_set_recno(cbt, cbt->recno - 1)) {
        if (cbt->recno < page->pg_var_recno)
            return (WT_NOTFOUND);

        /* Find the matching WT_COL slot. */
        cip = __col_var_search(page, cbt->recno);
        if (cip == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        upd = NULL;
        if (cbt->ins != NULL)
            upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL) {
            /* Deleted records are counted and skipped. */
            if (WT_UPDATE_DELETED_ISSET(upd)) {
                ++cbt->page_deleted_count;
                continue;
            }
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /*
         * If we're at the same slot as the last reference and there's no
         * matching insert list item, re-use the return information (so
         * encoded items with large repeat counts aren't repeatedly decoded).
         * Otherwise, unpack the cell and build the return information.
         */
        if (cbt->cip_saved != cip) {
            cell = WT_COL_PTR(page, cip);
            if (cell == NULL)
                continue;
            __wt_cell_unpack(cell, &unpack);
            if (unpack.type == WT_CELL_DEL)
                continue;
            WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));

            cbt->cip_saved = cip;
        }
        val->data = cbt->tmp.data;
        val->size = cbt->tmp.size;
        return (0);
    }
    /* NOTREACHED */
}
/*
 * __cursor_row_next --
 *     Move to the next row-store item.
 *
 *     Returns 0 with the cursor's key and value set, WT_NOTFOUND at the end
 *     of the page, or whatever __cursor_row_slot_return returns.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_INSERT *ins;
    WT_ITEM *key, *val;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    key = &cbt->iface.key;
    val = &cbt->iface.value;
    session = (WT_SESSION_IMPL *)cbt->iface.session;

    /*
     * For row-store pages, we need a single item that tells us the part of
     * the page we're walking (otherwise switching from next to prev and
     * vice-versa is just too complicated), so the WT_ROW and WT_INSERT_HEAD
     * insert array slots are mapped into a single name space: slot 1 is the
     * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
     * WT_INSERT_HEAD[0], and so on. WT_INSERT lists are therefore the
     * odd-numbered slots and WT_ROW array entries the even-numbered slots.
     *
     * New page configuration.
     */
    if (newpage) {
        cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page);
        cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
        cbt->row_iteration_slot = 1;
        goto new_insert;
    }

    /* Move to the next entry and return the item. */
    for (;;) {
        /*
         * Continue traversing any insert list; maintain the insert list head
         * reference and entry count in case we switch to a cursor previous
         * movement.
         */
        if (cbt->ins != NULL)
            cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:
        ins = cbt->ins;
        if (ins != NULL) {
            /* Skip entries with no visible update and deleted entries. */
            upd = __wt_txn_read(session, ins->upd);
            if (upd != NULL && !WT_UPDATE_DELETED_ISSET(upd)) {
                key->data = WT_INSERT_KEY(ins);
                key->size = WT_INSERT_KEY_SIZE(ins);
                val->data = WT_UPDATE_DATA(upd);
                val->size = upd->size;
                return (0);
            }
            continue;
        }

        /* Check for the end of the page. */
        if (cbt->row_iteration_slot >= cbt->page->entries * 2 + 1)
            return (WT_NOTFOUND);
        ++cbt->row_iteration_slot;

        /*
         * Odd-numbered slots configure as WT_INSERT_HEAD entries,
         * even-numbered slots configure as WT_ROW entries.
         */
        if (cbt->row_iteration_slot & 0x01) {
            cbt->ins_head = WT_ROW_INSERT_SLOT(cbt->page, cbt->row_iteration_slot / 2 - 1);
            cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
            goto new_insert;
        }

        /* An on-page row: no insert list is being walked. */
        cbt->ins_head = NULL;
        cbt->ins = NULL;

        cbt->slot = cbt->row_iteration_slot / 2 - 1;
        rip = &cbt->page->u.row.d[cbt->slot];
        upd = __wt_txn_read(session, WT_ROW_UPDATE(cbt->page, rip));

        /* Return the row unless its visible update is a delete. */
        if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
            return (__cursor_row_slot_return(cbt, rip, upd));
    }
    /* NOTREACHED */
}
/*
 * __cursor_row_next --
 *     Move to the next row-store item.
 *
 *     Returns 0 with the cursor's key and value set, WT_NOTFOUND at the end
 *     of the page, or whatever __cursor_row_slot_return returns.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_INSERT *ins;
    WT_ITEM *key, *val;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    key = &cbt->iface.key;
    val = &cbt->iface.value;

    /* On a new page, start at the head of the smallest-key insert list. */
    if (newpage) {
        cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
        cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
        cbt->row_iteration_slot = 1;
        goto new_insert;
    }

    for (;;) {
        /* Continue traversing the current insert list, if any. */
        if (cbt->ins != NULL)
            cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:
        ins = cbt->ins;
        if (ins != NULL) {
            /* Read the update visible to this transaction. */
            upd = __wt_txn_read(session, ins->upd);
            if (upd == NULL)
                continue;

            if (!WT_UPDATE_DELETED_ISSET(upd)) {
                key->data = WT_INSERT_KEY(ins);
                key->size = WT_INSERT_KEY_SIZE(ins);
                val->data = WT_UPDATE_DATA(upd);
                val->size = upd->size;
                return (0);
            }

            /* Deleted entries are counted and skipped. */
            ++cbt->page_deleted_count;
            continue;
        }

        /* Check for the end of the page's row entries. */
        if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
            return (WT_NOTFOUND);
        ++cbt->row_iteration_slot;

        /*
         * Odd-numbered slots configure as WT_INSERT_HEAD entries,
         * even-numbered slots configure as WT_ROW entries.
         */
        if (cbt->row_iteration_slot & 0x01) {
            cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
            cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
            goto new_insert;
        }

        /* An on-page row: no insert list is being walked. */
        cbt->ins_head = NULL;
        cbt->ins = NULL;

        /* Map the iteration slot back to the WT_ROW array index. */
        cbt->slot = cbt->row_iteration_slot / 2 - 1;
        rip = &page->pg_row_d[cbt->slot];
        upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
        if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
            return (__cursor_row_slot_return(cbt, rip, upd));

        /* The row is deleted: count it and move on. */
        ++cbt->page_deleted_count;
    }
}
/*
 * __cursor_var_next --
 *     Move to the next, variable-length column-store item.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the walk moves
 *     past the page's last record, or an error from referencing the cell
 *     data.
 */
static inline int
__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_ITEM *val;
    WT_INSERT *ins;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;
    uint64_t rle, rle_start;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    val = &cbt->iface.value;

    rle_start = 0; /* -Werror=maybe-uninitialized */

    /* Initialize for each new page: start at the page's first record. */
    if (newpage) {
        cbt->last_standard_recno = __col_var_last_recno(page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, page->pg_var_recno);
        goto new_page;
    }

    /* Move to the next entry and return the item. */
    for (;;) {
        if (cbt->recno >= cbt->last_standard_recno)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->recno + 1);

new_page:
        /* Find the WT_COL slot matching the record number. */
        cip = __col_var_search(page, cbt->recno, &rle_start);
        if (cip == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        upd = NULL;
        if (cbt->ins != NULL)
            upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL) {
            /* Deleted records are counted and skipped. */
            if (WT_UPDATE_DELETED_ISSET(upd)) {
                ++cbt->page_deleted_count;
                continue;
            }
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /*
         * If we're at the same slot as the last reference and there's no
         * matching insert list item, re-use the return information (so
         * encoded items with large repeat counts aren't repeatedly decoded).
         * Otherwise, unpack the cell and build the return information.
         */
        if (cbt->cip_saved != cip) {
            cell = WT_COL_PTR(page, cip);
            if (cell == NULL)
                continue;
            __wt_cell_unpack(cell, &unpack);
            if (unpack.type == WT_CELL_DEL) {
                rle = __wt_cell_rle(&unpack);
                if (rle == 1)
                    continue;

                /*
                 * A run of deleted records: jump to the first record after
                 * the run, or to a larger record on the insert list if one
                 * comes sooner.
                 */
                ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
                cbt->recno = rle_start + rle;
                if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno)
                    cbt->recno = WT_INSERT_RECNO(ins);

                /* Adjust for the outer loop increment. */
                --cbt->recno;
                continue;
            }

            /* Reference the cell's value through the cursor's buffer. */
            WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));

            cbt->cip_saved = cip;
        }
        val->data = cbt->tmp.data;
        val->size = cbt->tmp.size;
        return (0);
    }
}
/*
 * __cursor_var_prev --
 *     Move to the previous, variable-length column-store item.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the walk moves
 *     off the front of the page, or an error from copying the cell data.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_INSERT *ins;
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;
    uint64_t *recnop;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    recnop = &cbt->iface.recno;
    val = &cbt->iface.value;

    /* Start at the page's last record on a new page, else step back one. */
    if (newpage) {
        cbt->last_standard_recno = __col_last_recno(cbt->page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        cbt->recno = cbt->last_standard_recno;
    } else
        --cbt->recno;

    /* Every "continue" below steps the cursor back by one record. */
    for (;; --cbt->recno) {
        /* Keep the application-visible record number in step. */
        *recnop = cbt->recno;
        if (cbt->recno < cbt->page->u.col_var.recno)
            return (WT_NOTFOUND);

        /* Find the matching WT_COL slot. */
        cip = __col_var_search(cbt->page, cbt->recno);
        if (cip == NULL)
            return (WT_NOTFOUND);

        /* Check any insert list for a matching record. */
        ins = __col_insert_search_match(WT_COL_UPDATE(cbt->page, cip), cbt->recno);
        upd = ins == NULL ? NULL : __wt_txn_read(session, ins->upd);
        if (upd != NULL) {
            if (WT_UPDATE_DELETED_ISSET(upd))
                continue;
            cbt->ins = ins;
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /*
         * If we're at the same slot as the last reference and there's no
         * matching insert list item, re-use the return information (so
         * encoded items with large repeat counts aren't repeatedly decoded).
         * Otherwise, unpack the cell and build the return information.
         */
        if (cbt->cip_saved != cip) {
            cell = WT_COL_PTR(cbt->page, cip);
            if (cell == NULL)
                continue;
            __wt_cell_unpack(cell, &unpack);

            if (unpack.type == WT_CELL_DEL)
                continue;

            /*
             * A plain value with no Huffman encoding configured is
             * referenced in place; everything else goes through the copy
             * routine.
             */
            if (unpack.type == WT_CELL_VALUE && session->btree->huffman_value == NULL) {
                cbt->tmp.data = unpack.data;
                cbt->tmp.size = unpack.size;
            } else {
                WT_RET(__wt_cell_unpack_copy(session, &unpack, &cbt->tmp));
            }

            cbt->cip_saved = cip;
        }
        val->data = cbt->tmp.data;
        val->size = cbt->tmp.size;
        return (0);
    }
    /* NOTREACHED */
}
/*
 * __cursor_var_prev --
 *     Move to the previous, variable-length column-store item.
 *
 *     Returns 0 with the cursor's value set, WT_NOTFOUND when the walk moves
 *     off the front of the page, or an error from referencing the cell data.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_COL *cip;
    WT_DECL_RET;
    WT_ITEM *val;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    val = &cbt->iface.value;

    /* Start at the page's last record on a new page, else step back one. */
    if (newpage) {
        cbt->last_standard_recno = __col_last_recno(cbt->page);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->last_standard_recno);
    } else
        __cursor_set_recno(cbt, cbt->recno - 1);

    /* Every "continue" below steps the cursor back by one record. */
    for (;; __cursor_set_recno(cbt, cbt->recno - 1)) {
        if (cbt->recno < cbt->page->u.col_var.recno)
            return (WT_NOTFOUND);

        /* Find the matching WT_COL slot. */
        cip = __col_var_search(cbt->page, cbt->recno);
        if (cip == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(cbt->page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(cbt->page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        upd = NULL;
        if (cbt->ins != NULL)
            upd = __wt_txn_read(session, cbt->ins->upd);
        if (upd != NULL) {
            if (WT_UPDATE_DELETED_ISSET(upd))
                continue;
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /*
         * If we're at the same slot as the last reference and there's no
         * matching insert list item, re-use the return information (so
         * encoded items with large repeat counts aren't repeatedly decoded).
         * Otherwise, unpack the cell and build the return information.
         */
        if (cbt->cip_saved != cip) {
            cell = WT_COL_PTR(cbt->page, cip);
            if (cell == NULL)
                continue;
            __wt_cell_unpack(cell, &unpack);
            if (unpack.type == WT_CELL_DEL)
                continue;

            /*
             * Restart for a variable-length column-store. We could catch
             * restart higher up the call-stack but there's no point to it:
             * unlike row-store (where a normal search path finds cached
             * overflow values), we have to access the page's reconciliation
             * structures, and that's as easy here as higher up the stack.
             */
            ret = __wt_cell_unpack_ref(session, &unpack, &cbt->tmp);
            if (ret == WT_RESTART)
                ret = __wt_ovfl_cache_col_restart(session, cbt->page, &unpack, &cbt->tmp);
            WT_RET(ret);

            cbt->cip_saved = cip;
        }
        val->data = cbt->tmp.data;
        val->size = cbt->tmp.size;
        return (0);
    }
    /* NOTREACHED */
}
/* * __stat_page_col_var -- * Stat a WT_PAGE_COL_VAR page. */ static void __stat_page_col_var( WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; uint32_t i; bool orig_deleted; unpack = &_unpack; deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0; WT_STAT_INCR(session, stats, btree_column_variable); /* * Walk the page counting regular items, adjusting if the item has been * subsequently deleted or not. This is a mess because 10-item RLE might * have 3 of the items subsequently deleted. Overflow items are harder, * we can't know if an updated item will be an overflow item or not; do * our best, and simply count every overflow item (or RLE set of items) * we see. */ WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { orig_deleted = true; ++deleted_cnt; } else { orig_deleted = false; __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_ADDR_DEL) orig_deleted = true; else { entry_cnt += __wt_cell_rle(unpack); rle_cnt += __wt_cell_rle(unpack) - 1; } if (unpack->ovfl) ++ovfl_cnt; } /* * Walk the insert list, checking for changes. For each insert * we find, correct the original count based on its state. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { upd = ins->upd; if (WT_UPDATE_DELETED_ISSET(upd)) { if (!orig_deleted) { ++deleted_cnt; --entry_cnt; } } else if (orig_deleted) { --deleted_cnt; ++entry_cnt; } } }