/*
 * __wt_btcur_next_random --
 *	Position the cursor on a randomly selected record (row-store only).
 *
 *	Returns ENOTSUP if the tree is not a row-store, otherwise zero or
 *	the first error reported by a helper. Errors raised once the cursor
 *	has been initialized cause the cursor to be reset before returning.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Column-store is rejected up front: an application can pick a
	 * random column-store record on its own, so there is no support
	 * for it here.
	 */
	if (cbt->btree->type != BTREE_ROW)
		WT_RET(ENOTSUP);

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	/* A failure here returns directly, nothing needs to be undone. */
	WT_RET(__cursor_func_init(cbt, 1));

	WT_ERR(__wt_row_random(session, cbt));

	/*
	 * If the cursor references a valid record return it, otherwise
	 * fall back to a search-near call (no exact-match indicator is
	 * requested). Either way the result is the function's result, so
	 * simply fall through to the error label.
	 */
	ret = __cursor_valid(cbt, &visible) ?
	    __wt_kv_return(session, cbt, visible) :
	    __wt_btcur_search_near(cbt, 0);

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_next_random --
 *	Position the cursor on a randomly selected record (row-store only).
 *
 *	Returns ENOTSUP for trees that are not row-store, WT_NOTFOUND when
 *	the random selection did not leave the cursor on an exact match,
 *	otherwise zero or a helper's error. Any error raised after the
 *	cursor has been initialized is passed through the cursor's
 *	error-resolution function before returning.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * Column-store is not supported: an application can pick a random
	 * column-store record on its own. The check is made after cursor
	 * initialization, so the failure goes through the error path.
	 */
	if (btree->type != BTREE_ROW)
		WT_ERR(ENOTSUP);
	WT_ERR(__wt_row_random(session, cbt));

	/* Only an exact match yields a key/value pair. */
	if (cbt->compare == 0)
		ret = __wt_kv_return(session, cbt);
	else
		ret = WT_NOTFOUND;

err:	if (ret != 0)
		WT_TRET(__cursor_error_resolve(cbt));
	return (ret);
}
/*
 * __wt_btcur_search --
 *	Look up the record matching the cursor's key.
 *
 *	Returns zero when a matching record (or an implicit fixed-length
 *	column-store record) is returned, WT_NOTFOUND when there is no
 *	match, or a helper's error. The cursor is reset on any non-zero
 *	return that occurs after cursor initialization.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;

	WT_STAT_FAST_CONN_INCR(session, cursor_search);
	WT_STAT_FAST_DATA_INCR(session, cursor_search);

	/* Row-store keys are size-checked before anything else happens. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	if (btree->type == BTREE_ROW)
		WT_ERR(__cursor_row_search(session, cbt, 0));
	else
		WT_ERR(__cursor_col_search(session, cbt));

	/* An exact match on a valid record: hand back its key/value. */
	if (cbt->compare == 0 && __cursor_valid(cbt, &visible)) {
		ret = __wt_kv_return(session, cbt, visible);
		goto err;
	}

	/* No match and no implicit record: the search failed. */
	if (!__cursor_fix_implicit(btree, cbt)) {
		ret = WT_NOTFOUND;
		goto err;
	}

	/*
	 * Creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records, so
	 * return a single zero byte as the record's value.
	 */
	cbt->recno = cursor->recno;
	cbt->v = 0;
	cursor->value.data = &cbt->v;
	cursor->value.size = 1;

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_search --
 *	Look up the record matching the cursor's key.
 *
 *	Returns zero when a matching record (or an implicit fixed-length
 *	column-store record) is returned, WT_NOTFOUND when there is no
 *	match, or a helper's error. The final return value is always passed
 *	to the cursor's resolve function once the cursor has been
 *	initialized.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;

	WT_BSTAT_INCR(session, cursor_read);

	/* Row-store keys are size-checked before anything else happens. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	__cursor_func_init(cbt, 1);

	/* The search sets the return value to zero on success. */
	if (btree->type == BTREE_ROW)
		WT_ERR(__wt_row_search(session, cbt, 0));
	else
		WT_ERR(__wt_col_search(session, cbt, 0));

	if (cbt->compare == 0 && !__cursor_invalid(cbt))
		ret = __wt_kv_return(session, cbt, 0);
	else if (__cursor_fix_implicit(btree, cbt)) {
		/*
		 * Creating a record past the end of the tree in a
		 * fixed-length column-store implicitly fills the gap with
		 * empty records: return a single zero byte as the value.
		 */
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else
		ret = WT_NOTFOUND;

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}
/*
 * __wt_btcur_next_random --
 *	Position the cursor on a random record in a row-store tree.
 *
 *	Two modes are supported. Without sampling (sample size of zero), a
 *	leaf page is chosen by a fresh random descent on every call. With
 *	sampling, the first call does a random descent and later calls walk
 *	forward from the current page, skipping a computed number of leaf
 *	pages, which gives reasonably uniform picks from unbalanced trees.
 *
 *	Returns zero on success. On failure the cursor is reset and the
 *	error is returned; non-row-store trees fail with ENOTSUP before the
 *	cursor is touched.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;
	wt_off_t file_bytes;
	uint64_t remaining;

	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Column-store is not supported: an application can pick a random
	 * column-store record on its own.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET_MSG(session, ENOTSUP,
		    "WT_CURSOR.next_random only supported by row-store tables");

	WT_STAT_CONN_INCR(session, cursor_next);
	WT_STAT_DATA_INCR(session, cursor_next);

	if (cbt->ref != NULL && cbt->next_random_sample_size != 0) {
		/*
		 * Sampling with a page already referenced: walk the tree,
		 * skipping leaf pages. The loop also continues while there
		 * is no page reference, because the skip count may come
		 * back as zero together with an end-of-walk condition when
		 * the last page skipped was the last leaf in the tree.
		 *
		 * Pages touched only for sampling aren't "useful": the read
		 * flags ask that in-memory pages keep their read generation
		 * and that newly read pages be evicted quickly.
		 */
		remaining = cbt->next_random_leaf_skip;
		while (cbt->ref == NULL || remaining > 0) {
			WT_ERR(__wt_tree_walk_skip(session,
			    &cbt->ref, &remaining, WT_READ_NO_GEN |
			    WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
		}
	} else {
		/*
		 * Not sampling, or no page referenced yet: choose a roughly
		 * random leaf page by descending the tree.
		 *
		 * When sampling, first compute how many leaf pages to skip
		 * between returned keys. The leaf page count is guessed as
		 * the file size divided by the allocation size (a guess
		 * that can be entirely wrong: it depends on the pages in
		 * this checkpoint, real page sizes and other factors). That
		 * guess is divided by the sample size and incremented so a
		 * tiny file never produces a skip value of zero.
		 *
		 * !!!
		 * Ideally, the number would be prime to avoid restart
		 * issues.
		 */
		if (cbt->next_random_sample_size != 0) {
			WT_ERR(btree->bm->size(
			    btree->bm, session, &file_bytes));
			cbt->next_random_leaf_skip = (uint64_t)
			    ((file_bytes / btree->allocsize) /
			    cbt->next_random_sample_size) + 1;
		}

		WT_ERR(__cursor_func_init(cbt, true));
		WT_WITH_PAGE_INDEX(session,
		    ret = __wt_row_random_descent(session, cbt));
		WT_ERR(ret);
	}

	/*
	 * Pick a random entry on the leaf page. If that entry isn't valid,
	 * try the next record and, if there is none, the previous record.
	 */
	WT_ERR(__wt_row_random_leaf(session, cbt));
	if (__cursor_valid(cbt, &visible))
		ret = __wt_kv_return(session, cbt, visible);
	else {
		ret = __wt_btcur_next(cbt, false);
		if (ret == WT_NOTFOUND)
			ret = __wt_btcur_prev(cbt, false);
	}
	if (ret == 0)
		return (0);

err:	WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_update --
 *	Update the record identified by the cursor's key.
 *
 *	Unless the cursor has WT_CURSTD_OVERWRITE set, the update is
 *	checked for conflicts and fails with WT_NOTFOUND if the record does
 *	not exist. A WT_RESTART result from any step restarts the whole
 *	operation from cursor initialization. On success the cursor is
 *	pointed at the update created by the modify call; on failure the
 *	cursor is reset.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	bool exists;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;

	WT_STAT_CONN_INCR(session, cursor_update);
	WT_STAT_DATA_INCR(session, cursor_update);
	WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);

	/* Size-check the key (row-store only) and the value. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * Once anything is written the tree is no longer empty: bulk-load
	 * is no longer possible and eviction should start paying attention
	 * to the tree.
	 */
	if (btree->bulk_load_ok) {
		btree->bulk_load_ok = false;
		__wt_btree_evictable(session, true);
	}

retry:	WT_RET(__cursor_func_init(cbt, true));

	if (btree->type == BTREE_COL_FIX || btree->type == BTREE_COL_VAR) {
		WT_ERR(__cursor_col_search(session, cbt, NULL));

		/*
		 * Without overwrite: check for conflicts, then require the
		 * record to exist. A record past the end of a fixed-length
		 * column-store counts as existing, because the gap is
		 * implicitly filled with empty records.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			exists =
			    cbt->compare == 0 && __cursor_valid(cbt, NULL);
			if (!exists && !__cursor_fix_implicit(btree, cbt))
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_col_modify(session, cbt, false);
	} else if (btree->type == BTREE_ROW) {
		WT_ERR(__cursor_row_search(session, cbt, NULL, true));

		/*
		 * Without overwrite: check for conflicts, then require the
		 * key to exist.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			WT_ERR(__curfile_update_check(cbt));
			exists =
			    cbt->compare == 0 && __cursor_valid(cbt, NULL);
			if (!exists)
				WT_ERR(WT_NOTFOUND);
		}
		ret = __cursor_row_modify(session, cbt, false);
	}

err:	if (ret == WT_RESTART) {
		WT_STAT_CONN_INCR(session, cursor_restart);
		WT_STAT_DATA_INCR(session, cursor_restart);
		goto retry;
	}

	/*
	 * On success, point the cursor at internal copies of the data
	 * rather than copying the key/value pair into cursor-local buffers
	 * or searching again (a second search could race and find a
	 * different update). The btree cursor's modify_update field is how
	 * the modify function passes back the update it allocated.
	 */
	if (ret == 0)
		WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key, or on
 *	a neighboring record if there is no match.
 *
 *	If exactp is not NULL and the return is zero or WT_NOTFOUND, it is
 *	set to: the search comparison result when the searched-for position
 *	holds a valid record, 0 for an implicit fixed-length column-store
 *	record, 1 when the cursor moved to the next record and -1 when it
 *	moved to the previous record. The cursor is reset on any non-zero
 *	return.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;
	int cmp;
	bool found;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	visible = NULL;					/* -Wuninitialized */
	cmp = 0;
	found = false;

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	/*
	 * For row-store, try the currently pinned page first; search from
	 * the root if no page is pinned or the pinned page doesn't produce
	 * a usable result. Column-store always searches from the root:
	 * search-near isn't an interesting enough case there to justify
	 * the extra complexity.
	 *
	 * The row-store searches set the "insert" flag, as the cursor may
	 * end up positioned at the end of the tree rather than on an
	 * existing record.
	 */
	if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

		/*
		 * A result in the first or last slot of the pinned page is
		 * discarded, because a neighboring page might hold a better
		 * match. The test is deliberately simplistic: it ignores
		 * append lists (there may be no page slots at all, or the
		 * position may legitimately be after the last slot).
		 */
		if (cbt->slot != 0 &&
		    cbt->slot != cbt->ref->page->pg_row_entries - 1)
			found = __cursor_valid(cbt, &visible);
	}
	if (!found) {
		WT_ERR(__cursor_func_init(cbt, true));
		if (btree->type == BTREE_ROW)
			WT_ERR(__cursor_row_search(session, cbt, NULL, true));
		else
			WT_ERR(__cursor_col_search(session, cbt, NULL));
		found = __cursor_valid(cbt, &visible);
	}

	/*
	 * In order of preference:
	 *  - a valid record at the search position is returned;
	 *  - a record past the end of a fixed-length column-store is
	 *    instantiated as an empty record and is an exact match;
	 *  - the next record in the tree (bias for prefix searches);
	 *  - after redoing the search, the record there if it is valid,
	 *    else the previous record in the tree.
	 * Cursor next/previous skip invalid rows, so their results don't
	 * need to be re-tested. If nothing is found there's no record to
	 * return.
	 */
	if (found) {
		cmp = cbt->compare;
		ret = __wt_kv_return(session, cbt, visible);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		cmp = 0;
	} else {
		ret = __wt_btcur_next(cbt, false);
		if (ret != WT_NOTFOUND)
			cmp = 1;
		else {
			WT_ERR(__cursor_func_init(cbt, true));
			if (btree->type == BTREE_ROW)
				WT_ERR(__cursor_row_search(
				    session, cbt, NULL, true));
			else
				WT_ERR(__cursor_col_search(
				    session, cbt, NULL));

			if (__cursor_valid(cbt, &visible)) {
				cmp = cbt->compare;
				ret = __wt_kv_return(session, cbt, visible);
			} else {
				ret = __wt_btcur_prev(cbt, false);
				if (ret != WT_NOTFOUND)
					cmp = -1;
			}
		}
	}

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = cmp;
	return (ret);
}
/*
 * __wt_btcur_search --
 *	Look up the record matching the cursor's key.
 *
 *	Returns zero when a matching record (or an implicit fixed-length
 *	column-store record) is returned, WT_NOTFOUND when there is no
 *	match, or a helper's error. The cursor is reset on any non-zero
 *	return.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;
	bool found;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	visible = NULL;					/* -Wuninitialized */
	found = false;

	WT_STAT_CONN_INCR(session, cursor_search);
	WT_STAT_DATA_INCR(session, cursor_search);

	/*
	 * Try the currently pinned page first; if no page is pinned, or
	 * the pinned page doesn't hold an exact match, search from the
	 * root.
	 */
	if (F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		if (btree->type == BTREE_ROW)
			WT_ERR(__cursor_row_search(
			    session, cbt, cbt->ref, false));
		else
			WT_ERR(__cursor_col_search(session, cbt, cbt->ref));
		found = cbt->compare == 0 && __cursor_valid(cbt, &visible);
	}
	if (!found) {
		WT_ERR(__cursor_func_init(cbt, true));

		if (btree->type == BTREE_ROW)
			WT_ERR(__cursor_row_search(
			    session, cbt, NULL, false));
		else
			WT_ERR(__cursor_col_search(session, cbt, NULL));
		found = cbt->compare == 0 && __cursor_valid(cbt, &visible);
	}

	if (found)
		ret = __wt_kv_return(session, cbt, visible);
	else if (!__cursor_fix_implicit(btree, cbt))
		ret = WT_NOTFOUND;
	else {
		/*
		 * Creating a record past the end of the tree in a
		 * fixed-length column-store implicitly fills the gap with
		 * empty records: return a single zero byte as the value.
		 */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	}

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key, or on
 *	a neighboring record if there is no match.
 *
 *	If exactp is not NULL and the return is zero or WT_NOTFOUND, it is
 *	set to: the search comparison result when the searched-for position
 *	holds a valid record, 0 for an implicit fixed-length column-store
 *	record, 1 when the cursor moved to the next record and -1 when it
 *	moved to the previous record. The cursor is reset on any non-zero
 *	return that occurs after cursor initialization.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;
	int cmp;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	cmp = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	/* Row-store keys are size-checked before anything else happens. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * The row-store search sets the "insert" flag: the cursor may end
	 * up positioned at the end of the tree rather than on an existing
	 * record.
	 */
	if (btree->type == BTREE_ROW)
		WT_ERR(__cursor_row_search(session, cbt, 1));
	else
		WT_ERR(__cursor_col_search(session, cbt));

	/*
	 * In order of preference:
	 *  - a valid record at the search position is returned;
	 *  - a record past the end of a fixed-length column-store is
	 *    instantiated as an empty record and is an exact match;
	 *  - the next record in the tree (bias for prefix searches);
	 *  - after redoing the search, the record there if it is valid,
	 *    else the previous record in the tree.
	 * Cursor next/previous skip invalid rows, so their results don't
	 * need to be re-tested. If nothing is found there's no record to
	 * return.
	 */
	if (__cursor_valid(cbt, &visible)) {
		cmp = cbt->compare;
		ret = __wt_kv_return(session, cbt, visible);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		cmp = 0;
	} else {
		ret = __wt_btcur_next(cbt, 0);
		if (ret != WT_NOTFOUND)
			cmp = 1;
		else {
			if (btree->type == BTREE_ROW)
				WT_ERR(__cursor_row_search(session, cbt, 1));
			else
				WT_ERR(__cursor_col_search(session, cbt));

			if (__cursor_valid(cbt, &visible)) {
				cmp = cbt->compare;
				ret = __wt_kv_return(session, cbt, visible);
			} else {
				ret = __wt_btcur_prev(cbt, 0);
				if (ret != WT_NOTFOUND)
					cmp = -1;
			}
		}
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = cmp;
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key, or on
 *	a neighboring record if there is no match.
 *
 *	The exact argument is written without a NULL check, so callers must
 *	pass a valid pointer. It is set to 0 for an implicit fixed-length
 *	column-store record, to the search comparison result when the
 *	search position holds a valid record, to 1 when the cursor moved to
 *	the next record and to -1 when it moved to the previous record; it
 *	is left untouched when no record is found. The final return value
 *	is always passed to the cursor's resolve function once the cursor
 *	has been initialized.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;

	WT_BSTAT_INCR(session, cursor_read_near);

	/* Row-store keys are size-checked before anything else happens. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	__cursor_func_init(cbt, 1);

	/* The search sets the return value to zero on success. */
	if (btree->type == BTREE_ROW)
		WT_ERR(__wt_row_search(session, cbt, 0));
	else
		WT_ERR(__wt_col_search(session, cbt, 0));

	/*
	 * In order of preference:
	 *  - a record past the end of a fixed-length column-store is
	 *    instantiated as an empty record and is an exact match;
	 *  - a valid (not deleted) record at the search position is
	 *    returned;
	 *  - the next record in the tree (bias for prefix searches);
	 *  - after redoing the search, the record there if it is valid,
	 *    else the previous record in the tree.
	 * Cursor next/previous skip deleted records, so their results
	 * don't need to be re-tested. If nothing is found there's no
	 * record to return.
	 */
	if (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)) {
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		*exact = 0;
	} else if (!__cursor_invalid(cbt)) {
		*exact = cbt->compare;
		/* The last argument is 1 for an inexact match, else 0. */
		ret = __wt_kv_return(session, cbt, cbt->compare != 0);
	} else {
		ret = __wt_btcur_next(cbt);
		if (ret != WT_NOTFOUND)
			*exact = 1;
		else {
			if (btree->type == BTREE_ROW)
				WT_ERR(__wt_row_search(session, cbt, 0));
			else
				WT_ERR(__wt_col_search(session, cbt, 0));

			if (!__cursor_invalid(cbt)) {
				*exact = cbt->compare;
				ret = __wt_kv_return(
				    session, cbt, cbt->compare != 0);
			} else {
				ret = __wt_btcur_prev(cbt);
				if (ret != WT_NOTFOUND)
					*exact = -1;
			}
		}
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}