/* * __wt_btcur_update_check -- * Check whether an update would conflict. * * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so * they only check for conflicts without updating the tree. It is used to * maintain snapshot isolation for transactions that span multiple chunks * in an LSM tree. */ int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; cursor = &cbt->iface; btree = cbt->btree; session = (WT_SESSION_IMPL *)cursor->session; retry: WT_RET(__cursor_func_init(cbt, 1)); switch (btree->type) { case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, 1)); /* * Just check for conflicts. */ ret = __curfile_update_check(cbt); break; case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ILLEGAL_VALUE_ERR(session); } err: if (ret == WT_RESTART) goto retry; WT_TRET(__curfile_leave(cbt)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_next_random -- * Move to a random record in the tree. */ int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; /* * Only supports row-store: applications can trivially select a random * value from a column-store, if there were any reason to do so. */ if (btree->type != BTREE_ROW) WT_RET(ENOTSUP); WT_STAT_FAST_CONN_INCR(session, cursor_next); WT_STAT_FAST_DATA_INCR(session, cursor_next); WT_RET(__cursor_func_init(cbt, 1)); WT_ERR(__wt_row_random(session, cbt)); if (__cursor_valid(cbt, &upd)) WT_ERR(__wt_kv_return(session, cbt, upd)); else WT_ERR(__wt_btcur_search_near(cbt, 0)); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_reset -- * Invalidate the cursor position. */ int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) { WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); return (__cursor_reset(cbt)); }
/* * __wt_btcur_search -- * Search for a matching record in the tree. */ int __wt_btcur_search(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_FAST_CONN_INCR(session, cursor_search); WT_STAT_FAST_DATA_INCR(session, cursor_search); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_func_init(cbt, 1)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, 0) : __cursor_col_search(session, cbt)); if (cbt->compare == 0 && __cursor_valid(cbt, &upd)) ret = __wt_kv_return(session, cbt, upd); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. */ cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; } else ret = WT_NOTFOUND; err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_next_random -- * Move to a random record in the tree. There are two algorithms, one * where we select a record at random from the whole tree on each * retrieval and one where we first select a record at random from the * whole tree, and then subsequently sample forward from that location. * The sampling approach allows us to select reasonably uniform random * points from unbalanced trees. */ int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; wt_off_t size; uint64_t skip; session = (WT_SESSION_IMPL *)cbt->iface.session; btree = cbt->btree; /* * Only supports row-store: applications can trivially select a random * value from a column-store, if there were any reason to do so. */ if (btree->type != BTREE_ROW) WT_RET_MSG(session, ENOTSUP, "WT_CURSOR.next_random only supported by row-store tables"); WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); /* * If retrieving random values without sampling, or we don't have a * page reference, pick a roughly random leaf page in the tree. */ if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { /* * Skip past the sample size of the leaf pages in the tree * between each random key return to compensate for unbalanced * trees. * * Use the underlying file size divided by its block allocation * size as our guess of leaf pages in the file (this can be * entirely wrong, as it depends on how many pages are in this * particular checkpoint, how large the leaf and internal pages * really are, and other factors). Then, divide that value by * the configured sample size and increment the final result to * make sure tiny files don't leave us with a skip value of 0. * * !!! * Ideally, the number would be prime to avoid restart issues. */ if (cbt->next_random_sample_size != 0) { WT_ERR(btree->bm->size(btree->bm, session, &size)); cbt->next_random_leaf_skip = (uint64_t) ((size / btree->allocsize) / cbt->next_random_sample_size) + 1; } /* * Choose a leaf page from the tree. */ WT_ERR(__cursor_func_init(cbt, true)); WT_WITH_PAGE_INDEX( session, ret = __wt_row_random_descent(session, cbt)); WT_ERR(ret); } else { /* * Read through the tree, skipping leaf pages. Be cautious about * the skip count: if the last leaf page skipped was also the * last leaf page in the tree, it may be set to zero on return * with the end-of-walk condition. * * Pages read for data sampling aren't "useful"; don't update * the read generation of pages already in memory, and if a page * is read, set its generation to a low value so it is evicted * quickly. */ for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); } /* * Select a random entry from the leaf page. If it's not valid, move to * the next entry, if that doesn't work, move to the previous entry. */ WT_ERR(__wt_row_random_leaf(session, cbt)); if (__cursor_valid(cbt, &upd)) WT_ERR(__wt_kv_return(session, cbt, upd)); else { if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) ret = __wt_btcur_prev(cbt, false); WT_ERR(ret); } return (0); err: WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_update -- * Update a record in the tree. */ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_update); WT_STAT_DATA_INCR(session, cursor_update); WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; __wt_btree_evictable(session, true); } retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If not overwriting, fail if the key doesn't exist. If we * find an update for the key, check for conflicts. Update the * record if it exists. Creating a record past the end of the * tree in a fixed-length column-store implicitly fills the gap * with empty records. Update the record in that case, the * record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } ret = __cursor_col_modify(session, cbt, false); break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, true)); /* * If not overwriting, check for conflicts and fail if the key * does not exist. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, false); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* * If successful, point the cursor at internal copies of the data. We * could shuffle memory in the cursor so the key/value pair are in local * buffer memory, but that's a data copy. We don't want to do another * search (and we might get a different update structure if we race). * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_remove -- * Remove a record from the tree. */ int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_remove); WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If we find a matching record, check whether an update would * conflict. Do this before checking if the update is visible * in __cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* * Creating a record past the end of the tree in a * fixed-length column-store implicitly fills the * gap with empty records. Return success in that * case, the record was deleted successfully. * * Correct the btree cursor's location: the search * will have pointed us at the previous/next item, * and that's not correct. */ cbt->recno = cursor->recno; } else ret = __cursor_col_modify(session, cbt, true); break; case BTREE_ROW: /* Remove the record if it exists. */ WT_ERR(__cursor_row_search(session, cbt, NULL, false)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, true); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* * If the cursor is configured to overwrite and the record is not * found, that is exactly what we want. */ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND) ret = 0; if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_insert -- * Insert a record into the tree. */ int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_insert); WT_STAT_DATA_INCR(session, cursor_insert); WT_STAT_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; __wt_btree_evictable(session, true); } retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring * the application's record number). The real record number * is assigned by the serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); WT_ERR(__cursor_col_modify(session, cbt, false)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = cbt->recno; break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, true)); /* * If not overwriting, fail if the key exists, else insert the * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && cbt->compare == 0 && __cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, false); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* Insert doesn't maintain a position across calls, clear resources. */ if (ret == 0) WT_TRET(__curfile_leave(cbt)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_search_near -- * Search for a record in the tree. */ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact; bool valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* -Wuninitialized */ exact = 0; WT_STAT_CONN_INCR(session, cursor_search_near); WT_STAT_DATA_INCR(session, cursor_search_near); /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact * match, search from the root. Unlike WT_CURSOR.search, ignore pinned * pages in the case of column-store, search-near isn't an interesting * enough case for column-store to add the complexity needed to avoid * the tree search. * * Set the "insert" flag for the btree row-store search; we may intend * to position the cursor at the end of the tree, rather than match an * existing record. */ valid = false; if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { __wt_txn_cursor_op(session); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); /* * Search-near is trickier than search when searching an already * pinned page. If search returns the first or last page slots, * discard the results and search the full tree as the neighbor * pages might offer better matches. This test is simplistic as * we're ignoring append lists (there may be no page slots or we * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->pg_row_entries - 1) valid = __cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); valid = __cursor_valid(cbt, &upd); } /* * If we find a valid key, return it. * * Else, creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. In this * case, we instantiate the empty record, it's an exact match. * * Else, move to the next key in the tree (bias for prefix searches). * Cursor next skips invalid rows, so we don't have to test for them * again. * * Else, redo the search and move to the previous key in the tree. * Cursor previous skips invalid rows, so we don't have to test for * them again. * * If that fails, quit, there's no record to return. */ if (valid) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) exact = 1; else { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) exact = -1; } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_init(session, cbt)); #endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) *exactp = exact; return (ret); }
/* * __wt_btcur_search -- * Search for a matching record in the tree. */ int __wt_btcur_search(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; bool valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* -Wuninitialized */ WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); /* * If we have a page pinned, search it; if we don't have a page pinned, * or the search of the pinned page doesn't find an exact match, search * from the root. */ valid = false; if (F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { __wt_txn_cursor_op(session); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); } if (valid) ret = __wt_kv_return(session, cbt, upd); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. */ cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; } else ret = WT_NOTFOUND; #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_init(session, cbt)); #endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = true; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(page); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_search_near -- * Search for a record in the tree. */ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; exact = 0; WT_STAT_FAST_CONN_INCR(session, cursor_search_near); WT_STAT_FAST_DATA_INCR(session, cursor_search_near); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_func_init(cbt, 1)); /* * Set the "insert" flag for the btree row-store search; we may intend * to position our cursor at the end of the tree, rather than match an * existing record. */ WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, 1) : __cursor_col_search(session, cbt)); /* * If we find an valid key, return it. * * Else, creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. In this * case, we instantiate the empty record, it's an exact match. * * Else, move to the next key in the tree (bias for prefix searches). * Cursor next skips invalid rows, so we don't have to test for them * again. * * Else, redo the search and move to the previous key in the tree. * Cursor previous skips invalid rows, so we don't have to test for * them again. * * If that fails, quit, there's no record to return. */ if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND) exact = 1; else { WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, 1) : __cursor_col_search(session, cbt)); if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND) exact = -1; } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) *exactp = exact; return (ret); }
/* * __wt_btcur_next -- * Move to the next record in the tree. */ int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the next page, until we reach the end of the * file. */ flags = WT_READ_SKIP_INTL; /* tree walk flags */ if (truncating) LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; } else if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_next(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; /* * Column-store pages may have appended entries. Handle * it separately from the usual cursor code, it's in a * simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { F_SET(cbt, WT_CBT_ITERATE_APPEND); continue; } } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif if (ret == 0) F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/*将btree cursor移动到下一个记录*/ int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_next); WT_STAT_FAST_DATA_INCR(session, cursor_next); /*btree 扫描标示*/ flags = WT_READ_SKIP_INTL; if (truncating) LF_SET(WT_READ_TRUNCATE); /*激活一个btree cursor*/ WT_RET(__cursor_func_init(cbt, 0)); /*初始化cursor*/ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) __wt_btcur_iterate_setup(cbt, 1); /*对btree的扫描*/ for (;;){ page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /*column store append方式,在insert header上做扫描*/ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)){ switch (page->type){ case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; /*清除掉column store的标记*/ F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; } else if (page != NULL){ switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_next(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } /*找到对应的记录了,直接返回*/ if (ret != WT_NOTFOUND) break; /*假如是column store方式,检查是否要扫描insert header list*/ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { F_SET(cbt, WT_CBT_ITERATE_APPEND); continue; } } /*删除的记录太多,对page进行重组,增大page的填充因子*/ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))){ __wt_page_evict_soon(page); } cbt->page_deleted_count = 0; /*btree cursor跳转到下一个page上*/ WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) /*失败了,恢复cursor的状态*/ WT_TRET(__cursor_reset(cbt)); return ret; }
/* * __wt_btcur_next -- * Move to the next record in the tree. */ int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int skipped, newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_next); WT_STAT_FAST_DATA_INCR(session, cursor_next); flags = WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, 0)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) __wt_btcur_iterate_setup(cbt, 1); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the next page, until we reach the end of the * file. */ for (skipped = newpage = 0;; skipped = 0, newpage = 1) { page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_next( cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; } else if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_next(cbt, newpage, &skipped); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_next(cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { F_SET(cbt, WT_CBT_ITERATE_APPEND); continue; } } /* * If we scanned all the way through a page and only saw * deleted records, try to evict the page as we release it. * Otherwise repeatedly deleting from the beginning of a tree * can have quadratic performance. */ if (newpage && skipped) page->read_gen = WT_READGEN_OLDEST; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }