/*
 * __cursor_truncate_fix --
 *	Discard a cursor range from fixed-width column-store tree.
 *
 *	session: the session passed through to rmfunc.
 *	start:   first cursor of the range; if NULL, the range is walked
 *		 backward from stop.
 *	stop:    last cursor of the range; may be NULL when start is set, in
 *		 which case the walk continues until cursor-next fails.
 *	rmfunc:  per-record remove function, called for each visited record
 *		 whose value byte is non-zero.
 *
 *	Returns 0 on success; WT_NOTFOUND from the walk is filtered through
 *	WT_RET_NOTFOUND_OK rather than returned directly.
 */
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
{
	WT_DECL_RET;
	uint8_t *value;

	/*
	 * Handle fixed-length column-store objects separately: for row-store
	 * and variable-length column-store objects we have "deleted" values
	 * and so returned objects actually exist: fixed-length column-store
	 * objects are filled-in if they don't exist, that is, if you create
	 * record 37, records 1-36 magically appear.  Those records can't be
	 * deleted, which means we have to ignore already "deleted" records.
	 *
	 * First, call the standard cursor remove method to do a full search and
	 * re-position the cursor because we don't have a saved copy of the
	 * page's write generation information, which we need to remove records.
	 * Once that's done, we can delete records without a full search, unless
	 * we encounter a restart error because the page was modified by some
	 * other thread of control; in that case, repeat the full search to
	 * refresh the page's modification information.
	 */
	if (start == NULL) {
		do {
			WT_RET(__wt_btcur_remove(stop));

			/*
			 * Walk backward from the stop cursor; ret is always
			 * assigned by the cursor-prev call before the loop
			 * can exit, so no reset is needed here.
			 */
			for (;;) {
				if ((ret = __wt_btcur_prev(stop, 1)) != 0)
					break;
				stop->compare = 0;	/* Exact match */

				/* A zero value byte is an already "deleted" record. */
				value = (uint8_t *)stop->iface.value.data;
				if (*value != 0 &&
				    (ret = rmfunc(session, stop, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	} else {
		do {
			WT_RET(__wt_btcur_remove(start));

			/*
			 * Reset ret each time through so that we don't loop
			 * forever in the cursor equals case: after a restart,
			 * the loop below can exit on the equality test before
			 * anything assigns ret, which would otherwise leave
			 * the stale WT_RESTART value in place.
			 */
			for (ret = 0;;) {
				if (stop != NULL &&
				    __cursor_equals(start, stop))
					break;
				if ((ret = __wt_btcur_next(start, 1)) != 0)
					break;
				start->compare = 0;	/* Exact match */

				/* A zero value byte is an already "deleted" record. */
				value = (uint8_t *)start->iface.value.data;
				if (*value != 0 &&
				    (ret = rmfunc(session, start, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}
/*
 * __cursor_truncate --
 *	Remove every record in a cursor range from a row-store or
 * variable-width column-store tree.
 */
static int
__cursor_truncate(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, int))
{
	WT_DECL_RET;

	/*
	 * Each pass begins with the standard cursor remove: it performs a
	 * full search and re-positions the cursor, which is required because
	 * no copy of the page's write generation information has been saved
	 * and that information is needed to remove records.  After that,
	 * records are removed without a full search.  If another thread of
	 * control modifies the page we see a restart error and start the
	 * pass over, which refreshes the page's modification information.
	 *
	 * For row-stores, leaf pages without overflow items are deleted
	 * without being read.  That only works if the page referenced by the
	 * ending cursor has been read, because the truncation may cover only
	 * part of that final page.  The caller has already fully instantiated
	 * the end cursor, so that page is pinned in memory.
	 */
	if (start != NULL) {
		/* Forward walk: from start up to (and including) stop. */
		do {
			WT_RET(__wt_btcur_remove(start));

			/*
			 * Clear ret on every pass; otherwise a stale restart
			 * value would survive an immediate exit on the cursor
			 * equality test and the outer loop would never end.
			 */
			ret = 0;
			while (stop == NULL || !__cursor_equals(start, stop)) {
				if ((ret = __wt_btcur_next(start, 1)) != 0)
					break;
				start->compare = 0;	/* Exact match */
				if ((ret = rmfunc(session, start, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	} else {
		/* No start cursor: walk backward from stop. */
		do {
			WT_RET(__wt_btcur_remove(stop));

			while ((ret = __wt_btcur_prev(stop, 1)) == 0) {
				stop->compare = 0;	/* Exact match */
				if ((ret = rmfunc(session, stop, 1)) != 0)
					break;
			}
		} while (ret == WT_RESTART);
	}

	WT_RET_NOTFOUND_OK(ret);
	return (0);
}
/*
 * __curfile_prev --
 *	WT_CURSOR->prev implementation for btree-backed cursors: enter the
 * API, step the underlying btree cursor back, leave the API.
 */
static int
__curfile_prev(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	/* The public cursor handle is the btree cursor. */
	cbt = (WT_CURSOR_BTREE *)cursor;

	CURSOR_API_CALL(cursor, session, prev, cbt->btree);

	ret = __wt_btcur_prev(cbt, 0);

	API_END(session);
	return (ret);
}
/*
 * __curfile_prev --
 *	WT_CURSOR->prev implementation for btree-backed cursors.  Clears the
 * key/value "set" flags up front and marks them as internally set only
 * when the underlying move succeeds.
 */
static int
__curfile_prev(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;

	CURSOR_API_CALL(cursor, session, prev, cbt->btree);

	/* Drop any key/value flags before moving the cursor. */
	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

	ret = __wt_btcur_prev(cbt, 0);
	if (ret == 0) {
		/* Positioned: the key and value now come from the tree. */
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	}

err:	API_END_RET(session, ret);
}
/*
 * __curfile_prev --
 *	WT_CURSOR->prev implementation for btree-backed cursors.  On success,
 * asserts that the cursor carries an internal key and an internal value.
 */
static int
__curfile_prev(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;

	CURSOR_API_CALL(cursor, session, prev, cbt->btree);

	ret = __wt_btcur_prev(cbt, false);
	WT_ERR(ret);

	/*
	 * A successful prev leaves the cursor with a position, a key and a
	 * value: of the key/value "set" flag groups, exactly the internal
	 * flag must be on.
	 */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT &&
	    F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);

err:	API_END_RET(session, ret);
}
/*
 * __wt_btcur_next_random --
 *	Position the cursor on a randomly chosen record in the tree.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Row-store only: an application can trivially pick a random value
	 * from a column-store itself, should it ever need to.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET(ENOTSUP);

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	WT_RET(__cursor_func_init(cbt, 1));

	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_row_random(session, cbt));
	WT_ERR(ret);

	if (!__cursor_valid(cbt, &upd)) {
		/*
		 * The chosen slot isn't valid: try the next record and, if
		 * there isn't one, fall back to the previous record.
		 */
		ret = __wt_btcur_next(cbt, 0);
		if (ret == WT_NOTFOUND)
			ret = __wt_btcur_prev(cbt, 0);
		WT_ERR(ret);
	} else
		WT_ERR(__wt_kv_return(session, cbt, upd));

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_next_random --
 *	Position the cursor on a random record in the tree.  Two algorithms
 * are supported: either every retrieval picks a record at random from the
 * whole tree, or the first retrieval picks a random record and later
 * retrievals sample forward from that location.  Sampling gives reasonably
 * uniform random points even when the tree is unbalanced.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	wt_off_t size;
	uint64_t skip;

	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Row-store only: an application can trivially pick a random value
	 * from a column-store itself, should it ever need to.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET_MSG(session, ENOTSUP,
		    "WT_CURSOR.next_random only supported by row-store tables");

	WT_STAT_CONN_INCR(session, cursor_next);
	WT_STAT_DATA_INCR(session, cursor_next);

	if (cbt->ref != NULL && cbt->next_random_sample_size != 0) {
		/*
		 * Sampling with an existing page reference: walk the tree,
		 * skipping leaf pages.  Be careful with the skip count: when
		 * the last skipped leaf page is also the last leaf page in
		 * the tree, the count may come back as zero together with
		 * the end-of-walk condition, hence the test on the page
		 * reference as well.
		 *
		 * Pages read for sampling aren't "useful": the read
		 * generation of in-memory pages is left alone, and a page
		 * that has to be read gets a low generation so it's evicted
		 * quickly.
		 */
		skip = cbt->next_random_leaf_skip;
		while (cbt->ref == NULL || skip > 0)
			WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
			    WT_READ_NO_GEN |
			    WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
	} else {
		/*
		 * Not sampling, or no page reference yet: choose a roughly
		 * random leaf page from the tree.
		 *
		 * When sampling, first compute how many leaf pages to skip
		 * between returned keys, to compensate for unbalanced trees.
		 * The number of leaf pages is guessed as the underlying file
		 * size divided by the block allocation size (a guess that
		 * can be entirely wrong: it depends on how many pages are in
		 * this checkpoint, how large leaf and internal pages really
		 * are, and other factors).  That guess is divided by the
		 * configured sample size and then incremented, so tiny files
		 * never end up with a skip value of 0.
		 *
		 * !!!
		 * Ideally, the number would be prime to avoid restart issues.
		 */
		if (cbt->next_random_sample_size != 0) {
			WT_ERR(btree->bm->size(btree->bm, session, &size));
			cbt->next_random_leaf_skip = (uint64_t)
			    ((size / btree->allocsize) /
			    cbt->next_random_sample_size) + 1;
		}

		/* Descend to a leaf page. */
		WT_ERR(__cursor_func_init(cbt, true));
		WT_WITH_PAGE_INDEX(session,
		    ret = __wt_row_random_descent(session, cbt));
		WT_ERR(ret);
	}

	/*
	 * Pick a random entry on the leaf page.  If the entry isn't valid,
	 * try the next entry; failing that, try the previous entry.
	 */
	WT_ERR(__wt_row_random_leaf(session, cbt));
	if (!__cursor_valid(cbt, &upd)) {
		ret = __wt_btcur_next(cbt, false);
		if (ret == WT_NOTFOUND)
			ret = __wt_btcur_prev(cbt, false);
		WT_ERR(ret);
	} else
		WT_ERR(__wt_kv_return(session, cbt, upd));
	return (0);

err:	WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key or, when
 * there is no match, on a neighboring record; *exactp (if not NULL) reports
 * how the returned record compares.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;
	bool valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;				/* -Wuninitialized */
	exact = 0;
	valid = false;

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	/*
	 * With a row-store page pinned, search that page first; with no
	 * pinned page, or when the pinned-page search isn't conclusive,
	 * search from the root.  Unlike WT_CURSOR.search, pinned pages are
	 * ignored for column-store: search-near isn't an interesting enough
	 * column-store case to justify the complexity needed to avoid the
	 * tree search.
	 *
	 * The row-store search is passed the "insert" flag because we may
	 * want to position the cursor at the end of the tree rather than
	 * match an existing record.
	 */
	if (btree->type == BTREE_ROW &&
	    F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

		/*
		 * Searching an already pinned page is trickier for
		 * search-near than for search.  A result on the first or
		 * last page slot is thrown away in favor of a full tree
		 * search, since a neighboring page might hold a better
		 * match.  The test is simplistic: it ignores append lists
		 * (there may be no page slots at all, or we may legitimately
		 * be positioned after the last page slot).  Those cases are
		 * ignored, handling them makes things too complicated.
		 */
		if (cbt->slot != 0 &&
		    cbt->slot != cbt->ref->page->pg_row_entries - 1)
			valid = __cursor_valid(cbt, &upd);
	}
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));

		if (btree->type == BTREE_ROW)
			ret = __cursor_row_search(session, cbt, NULL, true);
		else
			ret = __cursor_col_search(session, cbt, NULL);
		WT_ERR(ret);

		valid = __cursor_valid(cbt, &upd);
	}

	/*
	 * In order of preference:
	 *
	 * A valid key was found: return it.
	 *
	 * Creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records; in that
	 * case instantiate the empty record, it's an exact match.
	 *
	 * Move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so there's no need to test for
	 * them again.
	 *
	 * Redo the search and move to the previous key in the tree.  Cursor
	 * previous skips invalid rows, so there's no need to test for them
	 * again.
	 *
	 * If all of that fails, quit: there's no record to return.
	 */
	if (valid) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		exact = 0;
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
		exact = 1;
	else {
		WT_ERR(__cursor_func_init(cbt, true));

		if (btree->type == BTREE_ROW)
			ret = __cursor_row_search(session, cbt, NULL, true);
		else
			ret = __cursor_col_search(session, cbt, NULL);
		WT_ERR(ret);

		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
			exact = -1;
	}

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	/* Report the comparison on success and on not-found only. */
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key or, when
 * there is no match, on a neighboring record; *exactp (if not NULL) reports
 * how the returned record compares.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	exact = 0;

	WT_STAT_FAST_CONN_INCR(session, cursor_search_near);
	WT_STAT_FAST_DATA_INCR(session, cursor_search_near);

	/* Row-store keys are size-checked before searching. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	WT_RET(__cursor_func_init(cbt, 1));

	/*
	 * The row-store search is passed the "insert" flag because we may
	 * want to position the cursor at the end of the tree rather than
	 * match an existing record.
	 */
	if (btree->type == BTREE_ROW)
		ret = __cursor_row_search(session, cbt, 1);
	else
		ret = __cursor_col_search(session, cbt);
	WT_ERR(ret);

	/*
	 * In order of preference:
	 *
	 * A valid key was found: return it.
	 *
	 * Creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records; in that
	 * case instantiate the empty record, it's an exact match.
	 *
	 * Move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so there's no need to test for
	 * them again.
	 *
	 * Redo the search and move to the previous key in the tree.  Cursor
	 * previous skips invalid rows, so there's no need to test for them
	 * again.
	 *
	 * If all of that fails, quit: there's no record to return.
	 */
	if (__cursor_valid(cbt, &upd)) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		exact = 0;
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND)
		exact = 1;
	else {
		if (btree->type == BTREE_ROW)
			ret = __cursor_row_search(session, cbt, 1);
		else
			ret = __cursor_col_search(session, cbt);
		WT_ERR(ret);

		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND)
			exact = -1;
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));

	/* Report the comparison on success and on not-found only. */
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the cursor's key or, when
 * there is no match, on a neighboring record; *exact reports how the
 * returned record compares.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_BSTAT_INCR(session, cursor_read_near);

	/* Row-store keys are size-checked before searching. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	__cursor_func_init(cbt, 1);

	if (btree->type == BTREE_ROW)
		ret = __wt_row_search(session, cbt, 0);
	else
		ret = __wt_col_search(session, cbt, 0);
	WT_ERR(ret);

	/*
	 * In order of preference:
	 *
	 * Creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records; in that
	 * case instantiate the empty record, it's an exact match.
	 *
	 * A valid key (one that wasn't deleted) was found: return it.
	 *
	 * A deleted key was found: try to move to the next key in the tree
	 * (bias for prefix searches).  Cursor next skips deleted records, so
	 * there's no need to test for them again.
	 *
	 * There's no larger key in the tree: redo the search and try to find
	 * an earlier record.  If that fails too, quit: there's no record to
	 * return.
	 */
	if (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)) {
		*exact = 0;
		cbt->v = 0;
		val = &cursor->value;
		val->data = &cbt->v;
		val->size = 1;
	} else if (!__cursor_invalid(cbt)) {
		*exact = cbt->compare;
		ret = __wt_kv_return(
		    session, cbt, cbt->compare != 0 ? 1 : 0);
	} else if ((ret = __wt_btcur_next(cbt)) != WT_NOTFOUND)
		*exact = 1;
	else {
		if (btree->type == BTREE_ROW)
			ret = __wt_row_search(session, cbt, 0);
		else
			ret = __wt_col_search(session, cbt, 0);
		WT_ERR(ret);

		if (!__cursor_invalid(cbt)) {
			*exact = cbt->compare;
			ret = __wt_kv_return(
			    session, cbt, cbt->compare != 0 ? 1 : 0);
		} else if ((ret = __wt_btcur_prev(cbt)) != WT_NOTFOUND)
			*exact = -1;
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}