/*
 * __cursor_fix_append_next --
 *	Move the cursor to the next fixed-length record; this routine only
 *	walks the append list.
 */
static inline int
__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	value = &cbt->iface.value;

	/*
	 * On a freshly loaded page start from the first append-list entry.
	 * Otherwise, only step to the following entry once the cursor's record
	 * number has caught up with the current entry.
	 */
	if (newpage)
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
	else if (cbt->recno >= WT_INSERT_RECNO(cbt->ins))
		cbt->ins = WT_SKIP_NEXT(cbt->ins);

	/* An empty list, or stepping past its last entry: nothing to return. */
	if (cbt->ins == NULL)
		return (WT_NOTFOUND);

	/*
	 * This differs from the cursor-previous code. The append list lives on
	 * the last page of the tree but may be preceded by other rows, in
	 * which case the cursor's recno is already set and is simply
	 * incremented. If the recno is not set (a tree holding only appended
	 * items), it is 0 and the increment yields 1, which is correct.
	 */
	__cursor_set_recno(cbt, cbt->recno + 1);

	/*
	 * Fixed-width column-store appends are inherently non-transactional:
	 * even a non-visible update by a concurrent or aborted transaction
	 * changes the effective end of the data. Ideally all uncommitted
	 * changes at the end of the data would be skipped, but it is not known
	 * here how many there are and checking every time would be expensive.
	 * So, when the cursor is in a gap before the current entry, or the
	 * entry has no visible update, return zero (empty).
	 */
	visible = NULL;
	if (cbt->recno >= WT_INSERT_RECNO(cbt->ins))
		visible = __wt_txn_read(session, cbt->ins->upd);

	if (visible != NULL)
		value->data = WT_UPDATE_DATA(visible);
	else {
		cbt->v = 0;
		value->data = &cbt->v;
	}
	value->size = 1;
	return (0);
}
/*
 * __cursor_var_append_prev --
 *	Return the previous variable-length entry on the append list.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	value = &cbt->iface.value;

	/* Pick the starting entry: the list tail, or one step back. */
	if (newpage)
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
	else
		WT_RET(__cursor_skip_prev(cbt));

	while (cbt->ins != NULL) {
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

		/* Return the first entry with a visible, non-deleted update. */
		visible = __wt_txn_read(session, cbt->ins->upd);
		if (visible != NULL && !WT_UPDATE_DELETED_ISSET(visible)) {
			value->data = WT_UPDATE_DATA(visible);
			value->size = visible->size;
			return (0);
		}

		WT_RET(__cursor_skip_prev(cbt));
	}
	return (WT_NOTFOUND);
}
/*
 * __cursor_var_append_prev --
 *	Return the previous variable-length entry on the append list.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	value = &cbt->iface.value;

	/* Pick the starting entry: the list tail, or one step back. */
	if (newpage)
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
	else
		WT_RET(__cursor_skip_prev(cbt));

	while (cbt->ins != NULL) {
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

		visible = __wt_txn_read(session, cbt->ins->upd);
		if (visible != NULL) {
			if (!WT_UPDATE_DELETED_ISSET(visible)) {
				value->data = WT_UPDATE_DATA(visible);
				value->size = visible->size;
				return (0);
			}

			/* Count deleted entries whose delete is visible to all. */
			if (__wt_txn_visible_all(session, visible->txnid))
				++cbt->page_deleted_count;
		}

		WT_RET(__cursor_skip_prev(cbt));
	}
	return (WT_NOTFOUND);
}
/*
 * __cursor_var_append_next --
 *	Return the next variable-length entry on the append list.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/* Start at the list head on a new page, else at the following entry. */
	if (newpage)
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
	else
		cbt->ins = WT_SKIP_NEXT(cbt->ins);

	for (; cbt->ins != NULL; cbt->ins = WT_SKIP_NEXT(cbt->ins)) {
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

		/* Skip entries with no update visible to this reader. */
		visible = __wt_txn_read(session, cbt->ins->upd);
		if (visible == NULL)
			continue;

		/* Skip deleted entries, counting those visible to all. */
		if (visible->type == WT_UPDATE_DELETED) {
			if (__wt_txn_upd_visible_all(session, visible))
				++cbt->page_deleted_count;
			continue;
		}

		return (__wt_value_return(session, cbt, visible));
	}
	return (WT_NOTFOUND);
}
/*
 * __cursor_fix_append_prev --
 *	Return the previous fixed-length entry on the append list.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	value = &cbt->iface.value;

	if (newpage) {
		/* Begin at the last entry and adopt its record number. */
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		if (cbt->ins == NULL)
			return (WT_NOTFOUND);
		cbt->recno = WT_INSERT_RECNO(cbt->ins);
	} else {
		/*
		 * Only step to the preceding entry once the cursor sits exactly
		 * on the current one.
		 *
		 * NOTE(review): the result of __cursor_skip_prev is not checked
		 * here -- confirm whether it can fail in this revision.
		 */
		if (cbt->recno == WT_INSERT_RECNO(cbt->ins)) {
			__cursor_skip_prev(cbt);
			if (cbt->ins == NULL)
				return (WT_NOTFOUND);
		}
		cbt->recno -= 1;
	}

	/*
	 * Column-store appends are inherently non-transactional: even a
	 * non-visible update by a concurrent or aborted transaction changes
	 * the effective end of the data. Ideally all uncommitted changes at
	 * the end of the data would be skipped; instead, a record past the
	 * current entry or without a visible update reads as zero.
	 */
	cbt->iface.recno = cbt->recno;

	visible = NULL;
	if (cbt->recno <= WT_INSERT_RECNO(cbt->ins))
		visible = __wt_txn_read(session, cbt->ins->upd);

	if (visible != NULL)
		value->data = WT_UPDATE_DATA(visible);
	else {
		cbt->v = 0;
		value->data = &cbt->v;
	}
	value->size = 1;
	return (0);
}
/*
 * __cursor_fix_prev --
 *	Move to the previous, fixed-length column-store item.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = session->btree;
	value = &cbt->iface.value;

	if (newpage) {
		/* New page: start from the page's last record. */
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
	} else {
		/* Same page: stop at the page's first record, else step back. */
		if (cbt->recno == cbt->page->u.col_fix.recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno - 1);
	}

	/* Check any insert list for an exactly matching record. */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(cbt->page);
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
		cbt->ins = NULL;

	visible = NULL;
	if (cbt->ins != NULL)
		visible = __wt_txn_read(session, cbt->ins->upd);

	/* Prefer a visible update; otherwise read the on-page bit field. */
	if (visible != NULL)
		value->data = WT_UPDATE_DATA(visible);
	else {
		cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt);
		value->data = &cbt->v;
	}
	value->size = 1;
	return (0);
}
/*
 * __cursor_fix_next --
 *	Move to the next fixed-length column-store record, walking the page's
 *	own record space (not the append list).
 */
static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *value;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;
	value = &cbt->iface.value;

	if (newpage) {
		/* Switching to a new page: start from its first record. */
		cbt->last_standard_recno = __col_fix_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_fix_recno);
	} else {
		/* Already at (or past) the page's last record: we're done. */
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);
	}

	/* Look in the page's single update list for this record number. */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);

	/* A search result for a different record number is not a match. */
	if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
		cbt->ins = NULL;

	/* Read the update version visible under transaction isolation. */
	visible = NULL;
	if (cbt->ins != NULL)
		visible = __wt_txn_read(session, cbt->ins->upd);

	if (visible != NULL)
		value->data = WT_UPDATE_DATA(visible);
	else {
		/* No visible update: return the on-page bit-field value. */
		cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
		value->data = &cbt->v;
	}
	value->size = 1;
	return (0);
}
/*
 * __txn_op_log --
 *	Log an operation for the current transaction.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session,
    WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	WT_ITEM row_key, new_value;
	WT_UPDATE *update;
	uint64_t recno;

	WT_CLEAR(row_key);
	update = op->u.upd;
	new_value.data = WT_UPDATE_DATA(update);
	new_value.size = update->size;

	/*
	 * Pack exactly one of four log operations: a column-store remove or
	 * put, or a row-store remove or put.
	 */
	if (cbt->btree->type != BTREE_ROW) {
		/* Column-store: the key is the insert entry's record number. */
		recno = WT_INSERT_RECNO(cbt->ins);
		WT_ASSERT(session, recno != WT_RECNO_OOB);

		if (WT_UPDATE_DELETED_ISSET(update)) {
			WT_ERR(__wt_logop_col_remove_pack(
			    session, logrec, op->fileid, recno));
		} else {
			WT_ERR(__wt_logop_col_put_pack(
			    session, logrec, op->fileid, recno, &new_value));
		}
	} else {
		/* Row-store: fetch the key from the cursor's leaf position. */
		WT_ERR(__wt_cursor_row_leaf_key(cbt, &row_key));

		if (WT_UPDATE_DELETED_ISSET(update)) {
			WT_ERR(__wt_logop_row_remove_pack(
			    session, logrec, op->fileid, &row_key));
		} else {
			WT_ERR(__wt_logop_row_put_pack(
			    session, logrec, op->fileid, &row_key, &new_value));
		}
	}

err:	/* The key buffer is released on success and failure alike. */
	__wt_buf_free(session, &row_key);
	return (ret);
}
/*
 * __cursor_fix_next --
 *	Move to the next, fixed-length column-store item.
 */
static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;

	if (newpage) {
		/* New page: start from the reference's first record number. */
		cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->ref->ref_recno);
	} else {
		/* Same page: stop after the last record, else step forward. */
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);
	}

	/* Check any insert list for an exactly matching record. */
	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
	cbt->ins = __col_insert_search(
	    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
		cbt->ins = NULL;

	visible = NULL;
	if (cbt->ins != NULL)
		visible = __wt_txn_read(session, cbt->ins->upd);

	/* Prefer a visible update; otherwise read the on-page bit field. */
	if (visible != NULL)
		cbt->iface.value.data = visible->data;
	else {
		cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
		cbt->iface.value.data = &cbt->v;
	}
	cbt->iface.value.size = 1;
	return (0);
}
/*
 * __col_insert_alloc --
 *	Column-store insert: allocate a WT_INSERT structure and fill it in.
 *
 *	On success, *insp references the new structure and *ins_sizep holds the
 *	number of bytes allocated for it. On allocation failure the error is
 *	returned and neither output is written.
 */
static int
__col_insert_alloc(WT_SESSION_IMPL *session,
    uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *new_ins;
	size_t bytes;

	/* One WT_INSERT plus a forward pointer per skiplist level. */
	bytes = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	WT_RET(__wt_calloc(session, 1, bytes, &new_ins));

	/* Copy the record number into place. */
	WT_INSERT_RECNO(new_ins) = recno;

	*ins_sizep = bytes;
	*insp = new_ins;
	return (0);
}
/*
 * __cursor_var_append_next --
 *	Move a variable-length btree cursor forward along the append list.
 */
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	value = &cbt->iface.value;

	/* Start at the list head on a new page, else at the following entry. */
	if (newpage)
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
	else
		cbt->ins = WT_SKIP_NEXT(cbt->ins);

	for (; cbt->ins != NULL; cbt->ins = WT_SKIP_NEXT(cbt->ins)) {
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

		/* Nothing visible to this transaction: keep moving. */
		visible = __wt_txn_read(session, cbt->ins->upd);
		if (visible == NULL)
			continue;

		/* Deleted records are counted and skipped, never returned. */
		if (WT_UPDATE_DELETED_ISSET(visible)) {
			++cbt->page_deleted_count;
			continue;
		}

		/* Hand back the visible value. */
		value->data = WT_UPDATE_DATA(visible);
		value->size = visible->size;
		return (0);
	}
	return (WT_NOTFOUND);
}
/*
 * __cursor_fix_append_prev --
 *	Return the previous fixed-length entry on the append list.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_ITEM *value;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	value = &cbt->iface.value;

	if (newpage) {
		/*
		 * The append list holds the last records in the tree, so a new
		 * iteration starts at the list's last entry and takes its
		 * record number.
		 */
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		if (cbt->ins == NULL)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
	} else {
		/*
		 * Step back to the preceding WT_INSERT entry once the cursor
		 * has reached the current one.
		 */
		if (cbt->ins != NULL &&
		    cbt->recno <= WT_INSERT_RECNO(cbt->ins)) {
			WT_RET(__cursor_skip_prev(cbt));
		}

		/*
		 * Leading implicit records: the tree may have no records off
		 * the append list while the first appended record isn't record
		 * 1. If no WT_INSERT entry precedes the cursor and the page
		 * itself holds no records, keep returning records until record
		 * number 1 is reached; otherwise we're done.
		 *
		 * Arguably this belongs in the caller (checking the record
		 * number after every walk returns WT_NOTFOUND); it is detected
		 * here to keep that code out of the caller.
		 */
		if (cbt->ins == NULL &&
		    (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, cbt->recno - 1);
	}

	/*
	 * Fixed-width column-store appends are inherently non-transactional:
	 * even a non-visible update by a concurrent or aborted transaction
	 * changes the effective end of the data. Ideally all uncommitted
	 * changes at the end of the data would be skipped. (Variable-width
	 * stores differ: implicitly created records written by reconciliation
	 * are deleted and can never be seen by a read.) Here, a record with no
	 * entry, past its entry, or without a visible update reads as zero.
	 */
	visible = NULL;
	if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins))
		visible = __wt_txn_read(session, cbt->ins->upd);

	if (visible != NULL)
		value->data = WT_UPDATE_DATA(visible);
	else {
		cbt->v = 0;
		value->data = &cbt->v;
	}
	value->size = 1;
	return (0);
}
/*
 * __cursor_var_next --
 *	Move to the next variable-length column-store record on the page.
 *
 *	When newpage is set, iteration starts at page->pg_var_recno; otherwise
 *	the cursor's record number is advanced by one. Returns 0 with the
 *	cursor's value set, WT_NOTFOUND when the page has no standard records,
 *	the cursor is already at last_standard_recno or __col_var_search finds
 *	no slot, or an error from __wt_page_cell_data_ref.
 */
static inline int __cursor_var_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle, rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0; /* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage){
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_var_recno);
		goto new_page;
	}

	/* Move to the next entry and return the item. */
	for (;;){
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);

new_page:	/* Locate the WT_COL slot that covers this record number. */
		if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check the slot's update list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			/* A visible delete: count it and move on. */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded). Otherwise, unpack the cell and build the return
		 * information.
		 *
		 * We get here with upd == NULL, that is, no visible update for
		 * this record, so the value has to come from the on-page cell.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if ((rle = __wt_cell_rle(&unpack)) == 1)
					continue;

				/*
				 * A run of deleted records: rather than step
				 * through it one record at a time, jump to the
				 * end of the run, or to the first update-list
				 * entry after the current record if that comes
				 * sooner.
				 */
				ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
				cbt->recno = rle_start + rle;
				if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop increment. */
				--cbt->recno;
				continue;
			}

			/* Reference the cell's value through cbt->tmp. */
			WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));

			/* Remember the slot so cbt->tmp can be re-used. */
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return 0;
	}
}
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 *
 *	session: the calling session.
 *	cbt: the btree cursor; its ref, ins, ins_head, compare, slot and stack
 *	    fields are read, and ins, ins_head, modify_update and (on append)
 *	    recno are written.
 *	recno: the record number to modify; WT_RECNO_OOB requests an append.
 *	value: the new value, used when a WT_UPDATE is allocated here.
 *	upd_arg: an update list supplied by the caller; when non-NULL no
 *	    WT_UPDATE is allocated and nothing is logged.
 *	modify_type: the WT_UPDATE type for the allocated update.
 *	exclusive: passed through to the serialization functions.
 *
 *	Returns 0 on success or an error code; on error the WT_INSERT and any
 *	WT_UPDATE allocated here are freed.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, const WT_ITEM *value,
    WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
	static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_UPDATE *old_upd, *upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	bool append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	upd = upd_arg;
	append = logged = false;

	if (upd_arg == NULL) {
		if (modify_type == WT_UPDATE_RESERVE ||
		    modify_type == WT_UPDATE_TOMBSTONE) {
			/*
			 * Fixed-size column-store doesn't have on-page deleted
			 * values, it's a nul byte.
			 */
			if (modify_type == WT_UPDATE_TOMBSTONE &&
			    btree->type == BTREE_COL_FIX) {
				modify_type = WT_UPDATE_STANDARD;
				value = &col_fix_remove;
			}
		}

		/*
		 * There's a chance the application specified a record past the
		 * last record on the page. If that's the case and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. Also, an out-of-band recno
		 * implies an append operation, we're allocating a new row.
		 * Ignore any information obtained from the search.
		 */
		WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
		if (cbt->compare != 0 &&
		    (recno == WT_RECNO_OOB ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(cbt->ref) :
		    __col_fix_last_recno(cbt->ref)))) {
			append = true;
			cbt->ins = NULL;
			cbt->ins_head = NULL;
		}
	}

	/* We're going to modify the page, we should have loaded history. */
	WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO);

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));
	mod = page->modify;

	/*
	 * If modifying a record not previously modified, but which is in the
	 * same update slot as a previously modified record, cursor.ins will
	 * not be set because there's no list of update records for this recno,
	 * but cursor.ins_head will be set to point to the correct update slot.
	 * Acquire the necessary insert information, then create a new update
	 * entry and link it into the existing list. We get here if a page has
	 * a single cell representing multiple records (the records have the
	 * same value), and then a record in the cell is updated or removed,
	 * creating the update list for the cell, and then a cursor iterates
	 * into that same cell to update/remove a different record. We find the
	 * correct slot in the update array, but we don't find an update list
	 * (because it doesn't exist), and don't have the information we need
	 * to do the insert. Normally, we wouldn't care (we could fail and do
	 * a search for the record which would configure everything for the
	 * insert), but range truncation does this pattern for every record in
	 * the cell, and the performance is terrible. For that reason, catch it
	 * here.
	 */
	if (cbt->ins == NULL && cbt->ins_head != NULL) {
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
		if (cbt->ins != NULL) {
			if (WT_INSERT_RECNO(cbt->ins) == recno)
				cbt->compare = 0;
			else {
				/*
				 * The test below is for cursor.compare set to 0
				 * and cursor.ins set: cursor.compare wasn't set
				 * by the search we just did, and has an unknown
				 * value. Clear cursor.ins to avoid the test.
				 */
				cbt->ins = NULL;
			}
		}
	}

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, cursor.ins will be set to
	 * point to the correct update list. Create a new update entry and link
	 * it into the existing list.
	 *
	 * Else, allocate an insert array as necessary, build an insert/update
	 * structure pair, and link it into place.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd_arg == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session,
		    value, &upd, &upd_size, modify_type));
		WT_ERR(__wt_txn_modify(session, upd));
		/* From here on the error path must undo the transaction op. */
		logged = true;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size, false));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_append, ins_headp, 1);
			ins_headp = &mod->mod_col_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			/* Fixed-length pages use a single update list. */
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_update, ins_headp, 1);
			ins_headp = &mod->mod_col_update[0];
		} else {
			/* Variable-length pages use one list per page entry. */
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_update, ins_headp,
			    page->entries);
			ins_headp = &mod->mod_col_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		/*
		 * Check for insert split and checkpoint races in column-store:
		 * it's easy (as opposed to in row-store) and a difficult bug to
		 * otherwise diagnose.
		 */
		WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
		    (recno != WT_RECNO_OOB &&
		    mod->mod_col_split_recno > recno));

		if (upd_arg == NULL) {
			WT_ERR(__wt_update_alloc(session,
			    value, &upd, &upd_size, modify_type));
			WT_ERR(__wt_txn_modify(session, upd));
			/* From here on the error path must undo the op. */
			logged = true;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth, exclusive));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth, exclusive));
	}

	/* If the update was successful, add it to the in-memory log. */
	if (logged && modify_type != WT_UPDATE_RESERVE) {
		WT_ERR(__wt_txn_log_op(session, cbt));

		/*
		 * In case of append, the recno (key) for the value is assigned
		 * now. Set the recno in the transaction operation to be used
		 * in case this transaction is prepared to retrieve the update
		 * corresponding to this operation.
		 */
		__wt_txn_op_set_recno(session, cbt->recno);
	}

	/* The cleanup below is only reachable by jumping to the err label. */
	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);
		/* A caller-supplied update list is not ours to free. */
		if (upd_arg == NULL)
			__wt_free(session, upd);
	}

	return (ret);
}
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 *
 *	When newpage is set, iteration starts at the page's last standard
 *	record; otherwise the cursor's record number is decremented. Returns 0
 *	with the cursor's value set, WT_NOTFOUND when the page has no standard
 *	records, the record number drops below page->pg_var_recno or
 *	__col_var_search finds no slot, or an error from
 *	__wt_page_cell_data_ref.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0; /* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	/* Stop once we've moved before the page's first record. */
		if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip =
		    __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			/*
			 * Skip deleted records, counting only those whose
			 * delete is visible to all transactions.
			 */
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded). Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if (__wt_cell_rle(&unpack) == 1)
					continue;
				/*
				 * There can be huge gaps in the variable-length
				 * column-store name space appearing as deleted
				 * records. If more than one deleted record, do
				 * the work of finding the next record to return
				 * instead of looping through the records.
				 *
				 * First, find the largest record in the update
				 * list that's smaller than the current record.
				 */
				ins = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Second, for records with RLEs greater than 1,
				 * the above call to __col_var_search located
				 * this record in the page's list of repeating
				 * records, and returned the starting record.
				 * The starting record - 1 is the record to
				 * which we could skip, if there was no larger
				 * record in the update list.
				 */
				cbt->recno = rle_start - 1;
				if (ins != NULL &&
				    WT_INSERT_RECNO(ins) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop decrement. */
				++cbt->recno;
				continue;
			}
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));

			/* Remember the slot so cbt->tmp can be re-used. */
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp->data;
		val->size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
/*
 * __txn_op_log --
 *	Log an operation for the current transaction.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session,
    WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
{
	WT_CURSOR *cursor;
	WT_ITEM new_value;
	WT_UPDATE *update;
	uint64_t recno;

	cursor = &cbt->iface;

	update = op->u.upd;
	new_value.data = update->data;
	new_value.size = update->size;

	/*
	 * Pack the row- or column-store put, modify or remove that matches the
	 * update's type. The caller doesn't log reserve operations, so any
	 * other type is treated as illegal.
	 */
	if (cbt->btree->type != BTREE_ROW) {
		/* Column-store: the key is the insert entry's record number. */
		recno = WT_INSERT_RECNO(cbt->ins);
		WT_ASSERT(session, recno != WT_RECNO_OOB);

		switch (update->type) {
		case WT_UPDATE_STANDARD:
			WT_RET(__wt_logop_col_put_pack(
			    session, logrec, op->fileid, recno, &new_value));
			break;
		case WT_UPDATE_MODIFIED:
			WT_RET(__wt_logop_col_modify_pack(
			    session, logrec, op->fileid, recno, &new_value));
			break;
		case WT_UPDATE_DELETED:
			WT_RET(__wt_logop_col_remove_pack(
			    session, logrec, op->fileid, recno));
			break;
		WT_ILLEGAL_VALUE(session);
		}
	} else {
		/* Row-store: the key is taken from the cursor. */
#ifdef HAVE_DIAGNOSTIC
		__txn_op_log_row_key_check(session, cbt);
#endif
		switch (update->type) {
		case WT_UPDATE_STANDARD:
			WT_RET(__wt_logop_row_put_pack(session,
			    logrec, op->fileid, &cursor->key, &new_value));
			break;
		case WT_UPDATE_MODIFIED:
			WT_RET(__wt_logop_row_modify_pack(session,
			    logrec, op->fileid, &cursor->key, &new_value));
			break;
		case WT_UPDATE_DELETED:
			WT_RET(__wt_logop_row_remove_pack(
			    session, logrec, op->fileid, &cursor->key));
			break;
		WT_ILLEGAL_VALUE(session);
		}
	}

	return (0);
}
/*
 * __wt_col_append_serial_func --
 *	Server function to append an WT_INSERT entry to the tree.
 *
 *	args is the packed argument block decoded by __wt_col_append_unpack.
 *	Returns 0 on success, an error from the page write-generation check,
 *	or WT_RESTART if the record number is already on the list.
 */
int
__wt_col_append_serial_func(WT_SESSION_IMPL *session, void *args)
{
	WT_BTREE *btree;
	WT_INSERT *ins, *new_ins, ***ins_stack, **next_stack;
	WT_INSERT_HEAD *inshead, **insheadp, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint64_t recno;
	uint32_t write_gen;
	u_int i, skipdepth;

	btree = S2BT(session);

	__wt_col_append_unpack(args,
	    &page, &write_gen, &insheadp, &ins_stack, &next_stack,
	    &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_RET(__wt_page_write_gen_check(session, page, write_gen));

	/* Use the caller's new insert head if the list has none yet. */
	if ((inshead = *insheadp) == NULL)
		inshead = new_inshead;

	/*
	 * If the application specified a record number, there's a race: the
	 * application may have searched for the record, not found it, then
	 * called into the append code, and another thread might have added
	 * the record. Fortunately, we're in the right place because if the
	 * record didn't exist at some point, it can only have been created
	 * on this list. Search for the record, if specified.
	 *
	 * A record number of 0 means none was specified: allocate the next
	 * one from the tree's last record number.
	 */
	if ((recno = WT_INSERT_RECNO(new_ins)) == 0)
		recno = WT_INSERT_RECNO(new_ins) = ++btree->last_recno;

	ins = __col_insert_search(inshead, ins_stack, next_stack, recno);

	/* If we find the record number, there's been a race. */
	if (ins != NULL && WT_INSERT_RECNO(ins) == recno)
		WT_RET(WT_RESTART);

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory. Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.
	 */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		/*
		 * The new item becomes the tail at this level if the level
		 * was empty or the item is linked after the current tail.
		 */
		if (inshead->tail[i] == NULL ||
		    ins_stack[i] == &inshead->tail[i]->next[i])
			inshead->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	__wt_col_append_new_ins_taken(args);

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 *
	 * NOTE: it is important to do this after the item has been added to
	 * the list. Code can assume that if the list is set, it is non-empty.
	 */
	if (*insheadp == NULL) {
		WT_PUBLISH(*insheadp, new_inshead);
		__wt_col_append_new_inshead_taken(args);
	}

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.
	 *
	 * NOTE: it is important to do this after publishing the list entry.
	 * Code can assume that if the array is set, it is non-empty.
	 */
	if (page->modify->append == NULL) {
		page->modify->append = new_inslist;
		__wt_col_append_new_inslist_taken(args);
	}

	/*
	 * If we don't find the record, check to see if we extended the file,
	 * and update the last record number.
	 */
	if (recno > btree->last_recno)
		btree->last_recno = recno;

	__wt_page_and_tree_modify_set(session, page);
	return (0);
}
/*
 * __wt_col_modify --
 *     Column-store delete, insert, and update.
 *
 *     session: the session performing the modification.
 *     cbt:     btree cursor, already positioned by a search; supplies the
 *              page, record number, value, compare result and search stacks.
 *     op:      2 removes the record; 3 (or any other value) inserts/updates.
 *              The value 1 (append) must not be passed in: it is selected
 *              internally when the record number is 0 or lies past the last
 *              record on the page.
 *
 *     Returns 0 on success, otherwise the error that stopped the operation.
 *     On an append the cursor's recno is set from the inserted item.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_INSERT *ins, *ins_copy;
    WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
    WT_ITEM *value, _value;
    WT_PAGE *page;
    WT_UPDATE *old_upd, *upd, *upd_obsolete;
    size_t ins_size, new_inshead_size, new_inslist_size, upd_size;
    uint64_t recno;
    u_int skipdepth;
    int i, logged;

    btree = cbt->btree;
    page = cbt->page;
    recno = cbt->iface.recno;
    logged = 0;

    /* Append is chosen below, callers never request it directly. */
    WT_ASSERT(session, op != 1);

    switch (op) {
    case 2:                        /* Remove */
        /*
         * A fixed-length remove stores a single zero byte; any other
         * tree type passes a NULL value to the update allocation.
         */
        if (btree->type == BTREE_COL_FIX) {
            value = &_value;
            value->data = "";
            value->size = 1;
        } else
            value = NULL;
        break;
    case 3:                        /* Insert/Update */
    default:
        value = &cbt->iface.value;

        /*
         * There's some chance the application specified a record past
         * the last record on the page. If that's the case, and we're
         * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
         * append list, not the update list.
         */
        if (recno == 0 || recno > __col_last_recno(page))
            op = 1;
        break;
    }

    /* If we don't yet have a modify structure, we'll need one. */
    WT_RET(__wt_page_modify_init(session, page));

    /*
     * Everything the error/cleanup code frees starts out NULL, so the
     * cleanup is safe no matter how far we get.
     */
    ins = NULL;
    new_inshead = NULL;
    new_inslist = NULL;
    upd = NULL;

    /*
     * Delete, insert or update a column-store entry.
     *
     * If modifying a previously modified record, create a new WT_UPDATE
     * entry and have a serialized function link it into an existing
     * WT_INSERT entry's WT_UPDATE list.
     *
     * Else, allocate an insert array as necessary, build a WT_INSERT and
     * WT_UPDATE structure pair, and call a serialized function to insert
     * the WT_INSERT structure.
     */
    if (cbt->compare == 0 && cbt->ins != NULL) {
        /* Make sure the update can proceed. */
        WT_ERR(
            __wt_update_check(session, page, old_upd = cbt->ins->upd));

        /* Allocate the WT_UPDATE structure and transaction ID. */
        WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
        WT_ERR(__wt_txn_modify(session, &upd->txnid));
        /* From here on, a failure must undo the transaction entry. */
        logged = 1;

        /* Serialize the update. */
        WT_ERR(__wt_update_serial(session, page,
            cbt->write_gen, &cbt->ins->upd, old_upd,
            NULL, 0, &upd, upd_size, &upd_obsolete));

        /* Discard any obsolete WT_UPDATE structures. */
        if (upd_obsolete != NULL)
            __wt_update_obsolete_free(session, page, upd_obsolete);
    } else {
        /* Make sure the update can proceed. */
        WT_ERR(__wt_update_check(session, page, NULL));

        /*
         * There may be no insert list, allocate as necessary.
         *
         * Three cases pick the insert-head slot: the page's single
         * append slot, the single update slot of a fixed-length page,
         * or the per-entry update slot indexed by the cursor's slot.
         * When the page has no array yet, a new one is allocated here
         * and handed to the serialized function below.
         */
        new_inshead_size = new_inslist_size = 0;
        if (op == 1) {
            if (page->modify->append == NULL) {
                new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
                WT_ERR(
                    __wt_calloc_def(session, 1, &new_inslist));
                inshead = &new_inslist[0];
            } else
                inshead = &page->modify->append[0];
            cbt->ins_head = *inshead;
        } else if (page->type == WT_PAGE_COL_FIX) {
            if (page->modify->update == NULL) {
                new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *);
                WT_ERR(
                    __wt_calloc_def(session, 1, &new_inslist));
                inshead = &new_inslist[0];
            } else
                inshead = &page->modify->update[0];
        } else {
            if (page->modify->update == NULL) {
                new_inslist_size =
                    page->entries * sizeof(WT_INSERT_HEAD *);
                WT_ERR(__wt_calloc_def(
                    session, page->entries, &new_inslist));
                inshead = &new_inslist[cbt->slot];
            } else
                inshead = &page->modify->update[cbt->slot];
        }

        /*
         * There may be no WT_INSERT list, allocate as necessary.
         *
         * With a brand-new head the list is empty, so the cursor's
         * search stacks are reset to reference the new head's own
         * per-level head pointers.
         */
        if (*inshead == NULL) {
            new_inshead_size = sizeof(WT_INSERT_HEAD);
            WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
            for (i = 0; i < WT_SKIP_MAXDEPTH; i++) {
                cbt->ins_stack[i] = &new_inshead->head[i];
                cbt->next_stack[i] = NULL;
            }
            cbt->ins_head = new_inshead;
        }

        /* Choose a skiplist depth for this insert. */
        skipdepth = __wt_skip_choose_depth();

        /*
         * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
         * update the cursor to reference it.
         *
         * NOTE(review): cbt->ins is pointed at the new item before the
         * serialized insert runs; if that call fails, the item is
         * released at the err label while cbt->ins still refers to it.
         * Confirm callers reset the cursor after an error.
         */
        WT_ERR(__col_insert_alloc(
            session, recno, skipdepth, &ins, &ins_size));
        WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
        WT_ERR(__wt_txn_modify(session, &upd->txnid));
        /* From here on, a failure must undo the transaction entry. */
        logged = 1;
        ins->upd = upd;
        ins_size += upd_size;
        cbt->ins = ins;

        /* Insert or append the WT_INSERT structure. */
        if (op == 1) {
            /*
             * The serialized function clears ins: take a copy of
             * the pointer so we can look up the record number.
             */
            ins_copy = ins;
            WT_ERR(__wt_col_append_serial(session, page,
                cbt->write_gen, inshead,
                cbt->ins_stack, cbt->next_stack,
                &new_inslist, new_inslist_size,
                &new_inshead, new_inshead_size,
                &ins, ins_size, skipdepth));

            /* Put the new recno into the cursor. */
            cbt->recno = WT_INSERT_RECNO(ins_copy);
        } else
            WT_ERR(__wt_insert_serial(session, page,
                cbt->write_gen, inshead,
                cbt->ins_stack, cbt->next_stack,
                &new_inslist, new_inslist_size,
                &new_inshead, new_inshead_size,
                &ins, ins_size, skipdepth));
    }

    /* The error-only cleanup is skipped on the normal path. */
    if (0) {
err:        /*
         * Remove the update from the current transaction, so we don't
         * try to modify it on rollback.
         */
        if (logged)
            __wt_txn_unmodify(session);
        __wt_free(session, ins);
        __wt_free(session, upd);
    }

    /*
     * Common exit: release the spare array and list head if they are
     * still set here (presumably the serialized functions clear the ones
     * they took over, as noted for ins above -- confirm).
     */
    __wt_free(session, new_inslist);
    __wt_free(session, new_inshead);

    return (ret);
}
/*
 * __cursor_skip_prev --
 *     Move back one position in a skip list stack (aka "finger").
 *
 *     cbt: btree cursor; cbt->ins is the current item and cbt->ins_head the
 *          skiplist it lives on.  cbt->ins_stack and cbt->next_stack are
 *          rewritten to describe the position just before the current item.
 *
 *     On return cbt->ins is set from PREV_INS(cbt, 0) (presumably NULL when
 *     the current item is the first one on the list -- confirm against the
 *     macro).  Returns 0, or the error from a row-store search.
 */
static inline int
__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
{
    WT_INSERT *current, *ins;
    WT_ITEM key;
    WT_SESSION_IMPL *session;
    int i;

    session = (WT_SESSION_IMPL *)cbt->iface.session;

restart:
    /*
     * If the search stack does not point at the current item, fill it in
     * with a search.  Row-store searches by the item's key, column-store
     * by its record number.
     */
    while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
        if (cbt->btree->type == BTREE_ROW) {
            key.data = WT_INSERT_KEY(current);
            key.size = WT_INSERT_KEY_SIZE(current);
            WT_RET(__wt_search_insert(session, cbt, &key));
        } else
            cbt->ins = __col_insert_search(cbt->ins_head,
                cbt->ins_stack, cbt->next_stack,
                WT_INSERT_RECNO(current));
    }

    /*
     * Find the first node up the search stack that does not move.
     *
     * The depth of the current item must be at least this level, since we
     * see it in that many levels of the stack.
     *
     * !!! Watch these loops carefully: they all rely on the value of i,
     * and the exit conditions to end up with the right values are
     * non-trivial.
     */
    ins = NULL;            /* -Wconditional-uninitialized */
    for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
        if ((ins = PREV_INS(cbt, i + 1)) != current)
            break;

    /*
     * Find a starting point for the new search.  That is either at the
     * non-moving node if we found a valid node, or the beginning of the
     * next list down that is not the current node.
     *
     * Since it is the beginning of a list, and we know the current node
     * has a skip depth at least this high, any node we find must sort
     * before the current node.
     */
    if (ins == NULL || ins == current)
        for (; i >= 0; i--) {
            cbt->ins_stack[i] = NULL;
            cbt->next_stack[i] = NULL;
            ins = cbt->ins_head->head[i];
            if (ins != NULL && ins != current)
                break;
        }

    /* Walk any remaining levels until just before the current node. */
    while (i >= 0) {
        /*
         * If we get to the end of a list without finding the current
         * item, we must have raced with an insert.  Restart the search.
         */
        if (ins == NULL) {
            cbt->ins_stack[0] = NULL;
            cbt->next_stack[0] = NULL;
            goto restart;
        }
        if (ins->next[i] != current)        /* Stay at this level */
            ins = ins->next[i];
        else {                    /* Drop down a level */
            cbt->ins_stack[i] = &ins->next[i];
            cbt->next_stack[i] = ins->next[i];
            --i;
        }
    }

    /* If we found a previous node, the next one must be current. */
    if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
        goto restart;

    /* Step the cursor back to the item the level-0 stack entry denotes. */
    cbt->ins = PREV_INS(cbt, 0);
    return (0);
}
/*
 * __cursor_fix_append_prev --
 *     Step a fixed-length column-store cursor to the previous entry on the
 *     page's append list.
 *
 *     cbt:     btree cursor; cbt->ins, cbt->recno and the value item are
 *              updated.
 *     newpage: true when the cursor has just arrived on this page, in which
 *              case iteration starts from the last item on the append list.
 *
 *     Returns 0 with a one-byte value set in the cursor, WT_NOTFOUND when
 *     there is nothing further back on this list, or an error from stepping
 *     the skiplist.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;

    if (newpage) {
        /*
         * Starting a new iteration on this page: begin at the tail
         * of the append list and adopt that item's record number.
         */
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
        if (cbt->ins == NULL)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
    } else {
        /*
         * Step back on the skiplist only once the cursor's record
         * number has come down to the current item's.
         */
        if (cbt->ins != NULL &&
            cbt->recno <= WT_INSERT_RECNO(cbt->ins))
            WT_RET(__cursor_skip_prev(cbt));

        /*
         * Leading implicit records: the page may hold nothing but an
         * append list whose first item is not the page's first record
         * (a tree whose first inserted record wasn't record 1).  Those
         * missing records are still returned, as zeros.
         *
         * With no WT_INSERT entry left in front of us, stop if we have
         * reached the page's starting record number (possible after a
         * heavily-appended page splits), or if the page has records
         * of its own.  Otherwise keep walking down through the implicit
         * records.
         */
        if (cbt->ins == NULL) {
            if (cbt->recno == cbt->ref->ref_recno)
                return (WT_NOTFOUND);
            if (__col_fix_last_recno(cbt->ref) != 0)
                return (WT_NOTFOUND);
        }

        /* Continuing an iteration: simply move back one record. */
        __cursor_set_recno(cbt, cbt->recno - 1);
    }

    /*
     * Fixed-width appends are inherently non-transactional: even an
     * invisible update by a concurrent or aborted transaction moves the
     * effective end of the data.  So a record with no insert item, one
     * lying in a gap above the current item, or one with no visible
     * update reads as zero rather than being skipped.
     */
    upd = NULL;
    if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins))
        upd = __wt_txn_read(session, cbt->ins->upd);

    if (upd != NULL)
        cbt->iface.value.data = upd->data;
    else {
        cbt->v = 0;
        cbt->iface.value.data = &cbt->v;
    }
    cbt->iface.value.size = 1;
    return (0);
}