/*
 * __curfile_update_check --
 *	Report whether updating the key under the cursor would conflict.
 *
 *	The cursor must already be positioned. Call this before deciding to
 *	skip an update because no value is visible for the key: a conflict is
 *	still possible even when the transaction can see no value.
 *
 *	Returns 0 when there is nothing to check, otherwise whatever
 *	__wt_txn_update_check returns for the key's update list.
 */
static int
__curfile_update_check(WT_CURSOR_BTREE *cbt)
{
	WT_SESSION_IMPL *session;

	/* Without an exact match there is no existing update list. */
	if (cbt->compare != 0)
		return (0);

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/* The key lives on an insert list: check that entry's updates. */
	if (cbt->ins != NULL)
		return (__wt_txn_update_check(session, cbt->ins->upd));

	/* Only row-store pages with an update array have more to check. */
	if (cbt->btree->type != BTREE_ROW ||
	    cbt->ref->page->pg_row_upd == NULL)
		return (0);

	return (__wt_txn_update_check(
	    session, cbt->ref->page->pg_row_upd[cbt->slot]));
}
/*
 * __wt_update_check --
 *	Verify that an update is permitted and, if so, note the first
 *	transaction ID to touch the page in its page->modify structure.
 *
 *	Returns the error from __wt_txn_update_check if the update is not
 *	permitted, otherwise 0.
 */
int
__wt_update_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *next)
{
	/* Do the permission check first, before anything is allocated. */
	WT_RET(__wt_txn_update_check(session, next));

	/* A first transaction ID has already been recorded for the page. */
	if (page->modify->first_id != WT_TXN_NONE)
		return (0);

	/*
	 * Remember the ID of the first transaction to update the page.
	 * A race here is tolerated: the check for ancient updates has a
	 * buffer built into it.
	 */
	if (session->txn.id != WT_TXN_NONE)
		page->modify->first_id = session->txn.id;

	return (0);
}
/*
 * __wt_btcur_update_check --
 *	Check whether an update would conflict.
 *
 *	This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
 *	they only check for conflicts without updating the tree. It is used to
 *	maintain snapshot isolation for transactions that span multiple chunks
 *	in an LSM tree.
 *
 *	Only row-store trees are supported; column-store types fall through
 *	to the illegal-value handler. If the search or the check returns
 *	WT_RESTART the whole operation is retried. On any other error the
 *	cursor is reset before returning.
 */
int
__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cursor = &cbt->iface;
	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cursor->session;

retry:	WT_RET(__cursor_func_init(cbt, 1));

	switch (btree->type) {
	case BTREE_ROW:
		WT_ERR(__cursor_row_search(session, cbt, 1));

		/*
		 * We are only interested in checking for conflicts. Use the
		 * shared helper so that an exact match on an on-page key
		 * (no insert-list entry, updates in the page's update array)
		 * is checked as well as a match on an insert-list entry.
		 */
		ret = __curfile_update_check(cbt);
		break;
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	WT_TRET(__curfile_leave(cbt));
	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_row_modify --
 *	Row-store insert, update and delete.
 *
 *	The cursor must already have been positioned by a search. When the
 *	search found the key (cbt->compare == 0) a new WT_UPDATE is pushed onto
 *	the key's update list; otherwise a new WT_INSERT/WT_UPDATE pair is
 *	added to the appropriate insert skiplist. A remove is represented by
 *	an update built from a NULL value.
 *
 *	Returns 0 on success or the first error encountered; on error the
 *	change is withdrawn from the transaction and anything allocated here
 *	is released.
 */
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
{
	WT_DECL_RET;
	WT_INSERT *new_ins;
	WT_INSERT_HEAD *list_head, **list_headp;
	WT_ITEM *key, *value;
	WT_PAGE *page;
	WT_UPDATE *prev_upd, *new_upd, **upd_slot;
	size_t new_ins_size, new_upd_size;
	uint32_t list_slot;
	u_int depth, level;
	int txn_tracked;

	new_ins = NULL;
	new_upd = NULL;
	txn_tracked = 0;

	page = cbt->page;
	key = &cbt->iface.key;
	if (is_remove)
		value = NULL;
	else
		value = &cbt->iface.value;

	/* The page needs a modify structure before it can be changed. */
	WT_RET(__wt_page_modify_init(session, page));

	if (cbt->compare != 0) {
		/*
		 * Insert: the key was not found. Allocate an insert array as
		 * necessary, build a WT_INSERT/WT_UPDATE pair, and hand the
		 * WT_INSERT to the serialized insert function.
		 *
		 * The insert array has one extra slot, used for keys sorting
		 * less than any key on the page. Working out when that slot
		 * applies is awkward (first page slot returned, no insert
		 * list processed, search key smaller than the slot's key), so
		 * the search records it in a cursor flag instead.
		 */
		WT_PAGE_ALLOC_AND_SWAP(session, page,
		    page->u.row.ins, list_headp, page->entries + 1);
		if (F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST))
			list_slot = page->entries;
		else
			list_slot = cbt->slot;
		list_headp = &page->u.row.ins[list_slot];

		/* Make sure the slot has a WT_INSERT_HEAD. */
		WT_PAGE_ALLOC_AND_SWAP(
		    session, page, *list_headp, list_head, 1);
		list_head = *list_headp;

		/* Pick how many skiplist levels the new entry will use. */
		depth = __wt_skip_choose_depth();

		/* Build the WT_INSERT/WT_UPDATE pair. */
		WT_ERR(__wt_row_insert_alloc(
		    session, key, depth, &new_ins, &new_ins_size));
		WT_ERR(__wt_update_alloc(
		    session, value, &new_upd, &new_upd_size));
		new_ins->upd = new_upd;
		new_ins_size += new_upd_size;

		/*
		 * Point the cursor at the new entry: the WT_INSERT_HEAD may
		 * have just been allocated, the WT_INSERT certainly was.
		 */
		cbt->ins_head = list_head;
		cbt->ins = new_ins;

		WT_ERR(__wt_txn_modify(session, cbt, new_upd));
		txn_tracked = 1;

		/*
		 * With no insert list at search time the cursor's stacks were
		 * never initialized, so build them from the list head.
		 * Otherwise link the new entry to the successors the search
		 * recorded; the serialization function re-validates them.
		 *
		 * The serial mutex is the memory barrier that flushes these
		 * writes before the entry becomes visible in the list.
		 */
		if (WT_SKIP_FIRST(list_head) == NULL) {
			for (level = 0; level < depth; level++) {
				cbt->ins_stack[level] =
				    &list_head->head[level];
				new_ins->next[level] =
				    cbt->next_stack[level] = NULL;
			}
		} else {
			for (level = 0; level < depth; level++)
				new_ins->next[level] = cbt->next_stack[level];
		}

		/* Link the WT_INSERT into the skiplist. */
		WT_ERR(__wt_insert_serial(session, page, cbt->ins_head,
		    cbt->ins_stack, &new_ins, new_ins_size, depth));
	} else {
		/*
		 * Modify: the key was found. Locate (allocating the update
		 * array as necessary) the head of the key's update list,
		 * build a WT_UPDATE, and hand it to the serialized update
		 * function.
		 */
		if (cbt->ins != NULL)
			upd_slot = &cbt->ins->upd;
		else {
			/* On-page key: use the page's per-slot update array. */
			WT_PAGE_ALLOC_AND_SWAP(session, page,
			    page->u.row.upd, upd_slot, page->entries);
			upd_slot = &page->u.row.upd[cbt->slot];
		}

		/* Confirm the update may proceed. */
		prev_upd = *upd_slot;
		WT_ERR(__wt_txn_update_check(session, prev_upd));

		/* Build the WT_UPDATE and register it with the transaction. */
		WT_ERR(__wt_update_alloc(
		    session, value, &new_upd, &new_upd_size));
		WT_ERR(__wt_txn_modify(session, cbt, new_upd));
		txn_tracked = 1;

		/*
		 * Chain the new update in front of the list head read above.
		 * The serialization function's lock is the memory barrier
		 * that flushes this write.
		 */
		new_upd->next = prev_upd;

		/* Publish the update. */
		WT_ERR(__wt_update_serial(
		    session, page, upd_slot, &new_upd, new_upd_size));
	}

	return (ret);

err:
	/*
	 * Withdraw the update from the current transaction so rollback does
	 * not try to modify it, then release what was allocated here.
	 */
	if (txn_tracked)
		__wt_txn_unmodify(session);
	__wt_free(session, new_ins);
	cbt->ins = NULL;
	__wt_free(session, new_upd);
	return (ret);
}
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; WT_UPDATE *old_upd; size_t ins_size, upd_size; u_int i, skipdepth; int append, logged; btree = cbt->btree; ins = NULL; page = cbt->ref->page; append = logged = 0; /* This code expects a remove to have a NULL value. */ if (is_remove) { if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; } else { /* * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. In addition, a recno of 0 * implies an append operation, we're allocating a new row. */ if (recno == 0 || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; } /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, create a new WT_UPDATE * entry and have a serialized function link it into an existing * WT_INSERT entry's WT_UPDATE list. * * Else, allocate an insert array as necessary, build a WT_INSERT and * WT_UPDATE structure pair, and call a serialized function to insert * the WT_INSERT structure. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ WT_ASSERT(session, upd == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. 
*/ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, &cbt->ins->upd, &upd, upd_size)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_append, ins_headp, 1); ins_headp = &page->modify->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, 1); ins_headp = &page->modify->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, page, page->modify->mod_update, ins_headp, page->pg_var_entries); ins_headp = &page->modify->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it (the WT_INSERT_HEAD might * be allocated, the WT_INSERT was allocated). */ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); cbt->ins_head = ins_head; cbt->ins = ins; if (upd == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); logged = 1; /* Avoid a data copy in WT_CURSOR.update. 
*/ cbt->modify_update = upd; } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; ins_size += upd_size; /* * If there was no insert list during the search, or there was * no search because the record number has not been allocated * yet, the cursor's information cannot be correct, search * couldn't have initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). * * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Append or insert the WT_INSERT structure. */ if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, &cbt->recno, skipdepth)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth)); } /* If the update was successful, add it to the in-memory log. */ if (logged) WT_ERR(__wt_txn_log_op(session, cbt)); if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); __wt_free(session, upd); } return (ret); }
/*
 * __wt_col_modify --
 *	Column-store delete, insert, and update.
 *
 *	session:     the calling session.
 *	cbt:         the cursor; its compare/ins/ins_head/slot fields are read
 *	             to locate the record, and ins, ins_head, ins_stack,
 *	             next_stack, compare and modify_update may be rewritten.
 *	recno:       the record number; WT_RECNO_OOB requests an append.
 *	value:       the value for a newly allocated update (replaced by a
 *	             one-byte nul value for fixed-length tombstones).
 *	upd_arg:     optional pre-built update list to link in rather than
 *	             allocating a new WT_UPDATE; it is not registered with the
 *	             transaction here and is not freed on error.
 *	modify_type: the update type handed to __wt_update_alloc; operations
 *	             of type WT_UPDATE_RESERVE are not logged.
 *	exclusive:   passed through to the insert/append serial functions.
 *
 *	Returns 0 on success or the first error encountered.
 */
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
    uint64_t recno, const WT_ITEM *value,
    WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
	/* Value stored for a remove in a fixed-length column store. */
	static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_INSERT *ins;
	WT_INSERT_HEAD *ins_head, **ins_headp;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_UPDATE *old_upd, *upd;
	size_t ins_size, upd_size;
	u_int i, skipdepth;
	bool append, logged;

	btree = cbt->btree;
	ins = NULL;
	page = cbt->ref->page;
	upd = upd_arg;
	append = logged = false;

	/* The setup below applies only when no update list was supplied. */
	if (upd_arg == NULL) {
		if (modify_type == WT_UPDATE_RESERVE ||
		    modify_type == WT_UPDATE_TOMBSTONE) {
			/*
			 * Fixed-size column-store doesn't have on-page deleted
			 * values, it's a nul byte.
			 */
			if (modify_type == WT_UPDATE_TOMBSTONE &&
			    btree->type == BTREE_COL_FIX) {
				modify_type = WT_UPDATE_STANDARD;
				value = &col_fix_remove;
			}
		}

		/*
		 * There's a chance the application specified a record past the
		 * last record on the page. If that's the case and we're
		 * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
		 * append list, not the update list. Also, an out-of-band recno
		 * implies an append operation, we're allocating a new row.
		 * Ignore any information obtained from the search.
		 */
		WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0);
		if (cbt->compare != 0 &&
		    (recno == WT_RECNO_OOB ||
		    recno > (btree->type == BTREE_COL_VAR ?
		    __col_var_last_recno(cbt->ref) :
		    __col_fix_last_recno(cbt->ref)))) {
			append = true;
			cbt->ins = NULL;
			cbt->ins_head = NULL;
		}
	}

	/* We're going to modify the page, we should have loaded history. */
	WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO);

	/* If we don't yet have a modify structure, we'll need one. */
	WT_RET(__wt_page_modify_init(session, page));
	mod = page->modify;

	/*
	 * If modifying a record not previously modified, but which is in the
	 * same update slot as a previously modified record, cursor.ins will
	 * not be set because there's no list of update records for this recno,
	 * but cursor.ins_head will be set to point to the correct update slot.
	 * Acquire the necessary insert information, then create a new update
	 * entry and link it into the existing list. We get here if a page has
	 * a single cell representing multiple records (the records have the
	 * same value), and then a record in the cell is updated or removed,
	 * creating the update list for the cell, and then a cursor iterates
	 * into that same cell to update/remove a different record. We find the
	 * correct slot in the update array, but we don't find an update list
	 * (because it doesn't exist), and don't have the information we need
	 * to do the insert. Normally, we wouldn't care (we could fail and do
	 * a search for the record which would configure everything for the
	 * insert), but range truncation does this pattern for every record in
	 * the cell, and the performance is terrible. For that reason, catch it
	 * here.
	 */
	if (cbt->ins == NULL && cbt->ins_head != NULL) {
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
		if (cbt->ins != NULL) {
			if (WT_INSERT_RECNO(cbt->ins) == recno)
				cbt->compare = 0;
			else {
				/*
				 * The test below is for cursor.compare set to 0
				 * and cursor.ins set: cursor.compare wasn't set
				 * by the search we just did, and has an unknown
				 * value. Clear cursor.ins to avoid the test.
				 */
				cbt->ins = NULL;
			}
		}
	}

	/*
	 * Delete, insert or update a column-store entry.
	 *
	 * If modifying a previously modified record, cursor.ins will be set to
	 * point to the correct update list. Create a new update entry and link
	 * it into the existing list.
	 *
	 * Else, allocate an insert array as necessary, build an insert/update
	 * structure pair, and link it into place.
	 */
	if (cbt->compare == 0 && cbt->ins != NULL) {
		/*
		 * If we are restoring updates that couldn't be evicted, the
		 * key must not exist on the new page.
		 */
		WT_ASSERT(session, upd_arg == NULL);

		/* Make sure the update can proceed. */
		WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));

		/* Allocate a WT_UPDATE structure and transaction ID. */
		WT_ERR(__wt_update_alloc(session,
		    value, &upd, &upd_size, modify_type));
		WT_ERR(__wt_txn_modify(session, upd));
		logged = true;

		/* Avoid a data copy in WT_CURSOR.update. */
		cbt->modify_update = upd;

		/*
		 * Point the new WT_UPDATE item to the next element in the list.
		 * If we get it right, the serialization function lock acts as
		 * our memory barrier to flush this write.
		 */
		upd->next = old_upd;

		/* Serialize the update. */
		WT_ERR(__wt_update_serial(
		    session, page, &cbt->ins->upd, &upd, upd_size, false));
	} else {
		/* Allocate the append/update list reference as necessary. */
		if (append) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_append, ins_headp, 1);
			ins_headp = &mod->mod_col_append[0];
		} else if (page->type == WT_PAGE_COL_FIX) {
			WT_PAGE_ALLOC_AND_SWAP(session,
			    page, mod->mod_col_update, ins_headp, 1);
			ins_headp = &mod->mod_col_update[0];
		} else {
			WT_PAGE_ALLOC_AND_SWAP(session, page,
			    mod->mod_col_update, ins_headp, page->entries);
			ins_headp = &mod->mod_col_update[cbt->slot];
		}

		/* Allocate the WT_INSERT_HEAD structure as necessary. */
		WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
		ins_head = *ins_headp;

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth(session);

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
		 * update the cursor to reference it (the WT_INSERT_HEAD might
		 * be allocated, the WT_INSERT was allocated).
		 */
		WT_ERR(__col_insert_alloc(
		    session, recno, skipdepth, &ins, &ins_size));
		cbt->ins_head = ins_head;
		cbt->ins = ins;

		/*
		 * Check for insert split and checkpoint races in column-store:
		 * it's easy (as opposed to in row-store) and a difficult bug to
		 * otherwise diagnose.
		 */
		WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
		    (recno != WT_RECNO_OOB &&
		    mod->mod_col_split_recno > recno));

		if (upd_arg == NULL) {
			WT_ERR(__wt_update_alloc(session,
			    value, &upd, &upd_size, modify_type));
			WT_ERR(__wt_txn_modify(session, upd));
			logged = true;

			/* Avoid a data copy in WT_CURSOR.update. */
			cbt->modify_update = upd;
		} else
			/* Caller-supplied list: just account for its size. */
			upd_size = __wt_update_list_memsize(upd);
		ins->upd = upd;
		ins_size += upd_size;

		/*
		 * If there was no insert list during the search, or there was
		 * no search because the record number has not been allocated
		 * yet, the cursor's information cannot be correct, search
		 * couldn't have initialized it.
		 *
		 * Otherwise, point the new WT_INSERT item's skiplist to the
		 * next elements in the insert list (which we will check are
		 * still valid inside the serialization function).
		 *
		 * The serial mutex acts as our memory barrier to flush these
		 * writes before inserting them into the list.
		 */
		if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
			for (i = 0; i < skipdepth; i++) {
				cbt->ins_stack[i] = &ins_head->head[i];
				ins->next[i] = cbt->next_stack[i] = NULL;
			}
		else
			for (i = 0; i < skipdepth; i++)
				ins->next[i] = cbt->next_stack[i];

		/* Append or insert the WT_INSERT structure. */
		if (append)
			WT_ERR(__wt_col_append_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, &cbt->recno, skipdepth, exclusive));
		else
			WT_ERR(__wt_insert_serial(
			    session, page, cbt->ins_head, cbt->ins_stack,
			    &ins, ins_size, skipdepth, exclusive));
	}

	/*
	 * If the update was successful, add it to the in-memory log. Reserve
	 * operations are not logged.
	 */
	if (logged && modify_type != WT_UPDATE_RESERVE) {
		WT_ERR(__wt_txn_log_op(session, cbt));

		/*
		 * In case of append, the recno (key) for the value is assigned
		 * now. Set the recno in the transaction operation so that, in
		 * case this transaction is prepared, it can be used to
		 * retrieve the update corresponding to this operation.
		 */
		__wt_txn_op_set_recno(session, cbt->recno);
	}

	if (0) {
err:		/*
		 * Remove the update from the current transaction, so we don't
		 * try to modify it on rollback.
		 */
		if (logged)
			__wt_txn_unmodify(session);
		__wt_free(session, ins);

		/* Only free an update that was allocated by this function. */
		if (upd_arg == NULL)
			__wt_free(session, upd);
	}

	return (ret);
}