/* * __cursor_var_append_prev -- * Return the previous variable-length entry on the append list. */ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_page; } for (;;) { WT_RET(__cursor_skip_prev(cbt)); new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL || WT_UPDATE_DELETED_ISSET(upd)) continue; val->data = WT_UPDATE_DATA(upd); val->size = upd->size; break; } return (0); }
/* * __cursor_var_append_prev -- * Return the previous variable-length entry on the append list. */ static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_page; } for (;;) { WT_RET(__cursor_skip_prev(cbt)); new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) continue; if (WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* NOTREACHED */ }
/* * __cursor_fix_append_prev -- * Return the previous fixed-length entry on the append list. */ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage) { WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); cbt->recno = WT_INSERT_RECNO(cbt->ins); } else { if (cbt->recno == WT_INSERT_RECNO(cbt->ins)) { __cursor_skip_prev(cbt); if (cbt->ins == NULL) return (WT_NOTFOUND); } --cbt->recno; } /* * Column store appends are inherently non-transactional. * * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. */ cbt->iface.recno = cbt->recno; if (cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); }
/* * __cursor_row_prev -- * Move to the previous row-store item. */ static inline int __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_INSERT *ins; WT_ITEM *key, *val; WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; key = &cbt->iface.key; val = &cbt->iface.value; /* * For row-store pages, we need a single item that tells us the part * of the page we're walking (otherwise switching from next to prev * and vice-versa is just too complicated), so we map the WT_ROW and * WT_INSERT_HEAD insert array slots into a single name space: slot 1 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are * odd-numbered slots, and WT_ROW array slots are even-numbered slots. * * New page configuration. */ if (newpage) { /* * If we haven't instantiated keys on this page, do so, else it * is a very, very slow traversal. */ if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(__wt_row_leaf_keys(session, page)); if (page->pg_row_entries == 0) cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); else cbt->ins_head = WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; goto new_insert; } /* Move to the previous entry and return the item. */ for (;;) { /* * Continue traversing any insert list. Maintain the reference * to the current insert element in case we switch to a cursor * next movement. */ if (cbt->ins != NULL) WT_RET(__cursor_skip_prev(cbt)); new_insert: if ((ins = cbt->ins) != NULL) { if ((upd = __wt_txn_read(session, ins->upd)) == NULL) continue; if (WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); val->data = WT_UPDATE_DATA(upd); val->size = upd->size; return (0); } /* Check for the beginning of the page. */ if (cbt->row_iteration_slot == 1) return (WT_NOTFOUND); --cbt->row_iteration_slot; /* * Odd-numbered slots configure as WT_INSERT_HEAD entries, * even-numbered slots configure as WT_ROW entries. */ if (cbt->row_iteration_slot & 0x01) { cbt->ins_head = cbt->row_iteration_slot == 1 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT( page, cbt->row_iteration_slot / 2 - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); goto new_insert; } cbt->ins_head = NULL; cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row_d[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) ++cbt->page_deleted_count; continue; } return (__cursor_row_slot_return(cbt, rip, upd)); } /* NOTREACHED */ }
/* * __cursor_fix_append_prev -- * Return the previous fixed-length entry on the append list. */ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); } else { /* * Handle the special case of leading implicit records, that is, * there aren't any records in the tree not on the append list, * and the first record on the append list isn't record 1. * * The "right" place to handle this is probably in our caller. * The high-level cursor-previous routine would: * -- call this routine to walk the append list * -- call the routine to walk the standard page items * -- call the tree walk routine looking for a previous page * Each of them returns WT_NOTFOUND, at which point our caller * checks the cursor record number, and if it's larger than 1, * returns the implicit records. Instead, I'm trying to detect * the case here, mostly because I don't want to put that code * into our caller. Anyway, if this code breaks for any reason, * that's the way I'd go. * * If we're not pointing to a WT_INSERT entry, or we can't find * a WT_INSERT record that precedes our record name-space, check * if there are any records on the page. If there aren't, then * we're in the magic zone, keep going until we get to a record * number of 1. */ if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins)) WT_RET(__cursor_skip_prev(cbt)); if (cbt->ins == NULL && (cbt->recno == 1 || __col_fix_last_recno(page) != 0)) return (WT_NOTFOUND); } /* * This code looks different from the cursor-next code. The append * list appears on the last page of the tree and contains the last * records in the tree. If we're iterating through the tree, starting * at the last record in the tree, by definition we're starting a new * iteration and we set the record number to the last record found in * the tree. Otherwise, decrement the record. */ if (newpage) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); else __cursor_set_recno(cbt, cbt->recno - 1); /* * Fixed-width column store appends are inherently non-transactional. * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. This * doesn't apply to variable-width column stores because the implicitly * created records written by reconciliation are deleted and so can be * never seen by a read. */ if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); val->size = 1; return (0); }
/* * __cursor_fix_append_prev -- * Return the previous fixed-length entry on the append list. */ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) return (WT_NOTFOUND); } else { /* Move to the previous record in the append list, if any. */ if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins)) WT_RET(__cursor_skip_prev(cbt)); /* * Handle the special case of leading implicit records, that is, * there aren't any records in the page not on the append list, * and the append list's first record isn't the first record on * the page. (Although implemented as a test of the page values, * this is really a test for a tree where the first inserted * record wasn't record 1, any other page with only an append * list will have a first page record number matching the first * record in the append list.) * * The "right" place to handle this is probably in our caller. * The high-level cursor-previous routine would: * -- call this routine to walk the append list * -- call the routine to walk the standard page items * -- call the tree walk routine looking for a previous page * Each of them returns WT_NOTFOUND, at which point our caller * checks the cursor record number, and if it's larger than 1, * returns the implicit records. Instead, I'm trying to detect * the case here, mostly because I don't want to put that code * into our caller. Anyway, if this code breaks for any reason, * that's the way I'd go. * * If we're not pointing to a WT_INSERT entry (we didn't find a * WT_INSERT record preceding our record name-space), check if * we've reached the beginning of this page, a possibility if a * page had a large number of items appended, and then split. * If not, check if there are any records on the page. If there * aren't, then we're in the magic zone, keep going until we get * to a record number matching the first record on the page. */ if (cbt->ins == NULL && (cbt->recno == cbt->ref->ref_recno || __col_fix_last_recno(cbt->ref) != 0)) return (WT_NOTFOUND); } /* * This code looks different from the cursor-next code. The append list * may be preceded by other rows. If we're iterating through the tree, * starting at the last record in the tree, by definition we're starting * a new iteration and we set the record number to the last record found * on the page. Otherwise, decrement the record. */ if (newpage) __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); else __cursor_set_recno(cbt, cbt->recno - 1); /* * Fixed-width column store appends are inherently non-transactional. * Even a non-visible update by a concurrent or aborted transaction * changes the effective end of the data. The effect is subtle because * of the blurring between deleted and empty values, but ideally we * would skip all uncommitted changes at the end of the data. This * doesn't apply to variable-width column stores because the implicitly * created records written by reconciliation are deleted and so can be * never seen by a read. */ if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; cbt->iface.value.data = &cbt->v; } else cbt->iface.value.data = upd->data; cbt->iface.value.size = 1; return (0); }