/*
 * __realloc_func --
 *     ANSI realloc function.
 */
static int
__realloc_func(WT_SESSION_IMPL *session,
    size_t *bytes_allocated_ret, size_t bytes_to_allocate, bool clear_memory,
    void *retp)
{
    void *p;
    size_t bytes_allocated;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     *
     * Sometimes we're allocating memory and we don't care about the
     * final length -- bytes_allocated_ret may be NULL.
     */
    p = *(void **)retp;
    bytes_allocated =
        (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret;
    WT_ASSERT(session,
        (p == NULL && bytes_allocated == 0) ||
        (p != NULL &&
        (bytes_allocated_ret == NULL || bytes_allocated != 0)));
    WT_ASSERT(session, bytes_to_allocate != 0);
    WT_ASSERT(session, bytes_allocated < bytes_to_allocate);

    if (session != NULL) {
        if (p == NULL)
            WT_STAT_CONN_INCR(session, memory_allocation);
        else
            WT_STAT_CONN_INCR(session, memory_grow);
    }

    if ((p = realloc(p, bytes_to_allocate)) == NULL)
        WT_RET_MSG(session, __wt_errno(),
            "memory allocation of %" WT_SIZET_FMT " bytes failed",
            bytes_to_allocate);

    /*
     * Clear the allocated memory, parts of WiredTiger depend on allocated
     * memory being cleared.
     */
    if (clear_memory)
        memset((uint8_t *)p + bytes_allocated,
            0, bytes_to_allocate - bytes_allocated);

    /* Update caller's bytes allocated value. */
    if (bytes_allocated_ret != NULL)
        *bytes_allocated_ret = bytes_to_allocate;

    *(void **)retp = p;
    return (0);
}
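/*
 * Illustrative sketch, not part of the WiredTiger tree: how a caller might
 * use the pointer/bytes-allocated pair maintained above. The helper name and
 * sizes are hypothetical; the sketch assumes __wt_realloc is the variant
 * that passes clear_memory=true, so only the newly grown bytes are zeroed on
 * the second call.
 */
static int
example_grow_buffer(WT_SESSION_IMPL *session)
{
    size_t allocated;
    void *buf;

    buf = NULL;
    allocated = 0;

    /* First call allocates 512 bytes and zeroes all of them. */
    WT_RET(__wt_realloc(session, &allocated, 512, &buf));

    /* Second call grows to 4KB and zeroes only bytes 512..4095. */
    WT_RET(__wt_realloc(session, &allocated, 4096, &buf));

    __wt_free(session, buf);
    return (0);
}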
/*
 * __curlog_search --
 *     WT_CURSOR.search method for the log cursor type.
 */
static int
__curlog_search(WT_CURSOR *cursor)
{
    WT_CURSOR_LOG *cl;
    WT_DECL_RET;
    WT_LSN key;
    WT_SESSION_IMPL *session;
    uint32_t counter, key_file, key_offset, raw;

    cl = (WT_CURSOR_LOG *)cursor;
    /* Temporarily turn off raw so we can do direct cursor operations. */
    raw = F_MASK(cursor, WT_CURSTD_RAW);
    F_CLR(cursor, WT_CURSTD_RAW);

    CURSOR_API_CALL(cursor, session, search, NULL);

    /*
     * !!! We are ignoring the counter and only searching based on the LSN.
     */
    WT_ERR(__wt_cursor_get_key(cursor, &key_file, &key_offset, &counter));
    WT_SET_LSN(&key, key_file, key_offset);
    ret = __wt_log_scan(session, &key, WT_LOGSCAN_ONE,
        __curlog_logrec, cl);
    if (ret == ENOENT)
        ret = WT_NOTFOUND;
    WT_ERR(ret);
    WT_ERR(__curlog_kv(session, cursor));
    WT_STAT_CONN_INCR(session, cursor_search);
    WT_STAT_DATA_INCR(session, cursor_search);

err:    F_SET(cursor, raw);
    API_END_RET(session, ret);
}
/*
 * __curds_remove --
 *     WT_CURSOR.remove method for the data-source cursor type.
 */
static int
__curds_remove(WT_CURSOR *cursor)
{
    WT_CURSOR *source;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

    CURSOR_REMOVE_API_CALL(cursor, session, NULL);

    WT_STAT_CONN_INCR(session, cursor_remove);
    WT_STAT_DATA_INCR(session, cursor_remove);
    WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

    WT_ERR(__curds_txn_enter(session, true));

    WT_ERR(__curds_key_set(cursor));
    ret = __curds_cursor_resolve(cursor, source->remove(source));

err:    __curds_txn_leave(session);

    CURSOR_UPDATE_API_END(session, ret);
    return (ret);
}
/*
 * __curds_insert --
 *     WT_CURSOR.insert method for the data-source cursor type.
 */
static int
__curds_insert(WT_CURSOR *cursor)
{
    WT_CURSOR *source;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

    CURSOR_UPDATE_API_CALL(cursor, session, insert);

    WT_ERR(__curds_txn_enter(session, true));

    WT_STAT_CONN_INCR(session, cursor_insert);
    WT_STAT_DATA_INCR(session, cursor_insert);
    WT_STAT_DATA_INCRV(session, cursor_insert_bytes,
        cursor->key.size + cursor->value.size);

    if (!F_ISSET(cursor, WT_CURSTD_APPEND))
        WT_ERR(__curds_key_set(cursor));
    WT_ERR(__curds_value_set(cursor));
    ret = __curds_cursor_resolve(cursor, source->insert(source));

err:    __curds_txn_leave(session);

    CURSOR_UPDATE_API_END(session, ret);
    return (ret);
}
/*
 * __wt_txn_global_shutdown --
 *     Shut down the global transaction state.
 */
int
__wt_txn_global_shutdown(WT_SESSION_IMPL *session)
{
    bool txn_active;

    /*
     * We're shutting down. Make sure everything gets freed.
     *
     * It's possible that the eviction server is in the middle of a long
     * operation, with a transaction ID pinned. In that case, we will loop
     * here until the transaction ID is released, when the oldest
     * transaction ID will catch up with the current ID.
     */
    for (;;) {
        WT_RET(__wt_txn_activity_check(session, &txn_active));
        if (!txn_active)
            break;

        WT_STAT_CONN_INCR(session, txn_release_blocked);
        __wt_yield();
    }

#ifdef HAVE_TIMESTAMPS
    /*
     * Now that all transactions have completed, no timestamps should be
     * pinned.
     */
    __wt_timestamp_set_inf(&S2C(session)->txn_global.pinned_timestamp);
#endif

    return (0);
}
/*
 * __wt_malloc --
 *     ANSI malloc function.
 */
int
__wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp)
{
    void *p;

    /*
     * Defensive: if our caller doesn't handle errors correctly, ensure a
     * free won't fail.
     */
    *(void **)retp = NULL;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    WT_ASSERT(session, bytes_to_allocate != 0);

    if (session != NULL)
        WT_STAT_CONN_INCR(session, memory_allocation);

    if ((p = malloc(bytes_to_allocate)) == NULL)
        WT_RET_MSG(session, __wt_errno(),
            "memory allocation of %" WT_SIZET_FMT " bytes failed",
            bytes_to_allocate);

    *(void **)retp = p;
    return (0);
}
/*
 * __wt_free_int --
 *     ANSI free function.
 */
void
__wt_free_int(WT_SESSION_IMPL *session, const void *p_arg)
{
    void *p;

    p = *(void **)p_arg;
    if (p == NULL)                /* ANSI C free semantics */
        return;

    /*
     * If there's a serialization bug we might race with another thread.
     * We can't avoid the race (and we aren't willing to flush memory),
     * but we minimize the window by clearing the free address, hoping a
     * racing thread will see, and won't free, a NULL pointer.
     */
    *(void **)p_arg = NULL;

    /*
     * !!!
     * This function MUST handle a NULL WT_SESSION_IMPL handle.
     */
    if (session != NULL)
        WT_STAT_CONN_INCR(session, memory_free);

    free(p);
}
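/*
 * Illustrative sketch, not the tree's actual macro text: __wt_free_int takes
 * the address of the caller's pointer so the caller's variable can be
 * cleared, turning an accidental double free into a harmless free of NULL.
 * The EXAMPLE_FREE macro and the usage function below are hypothetical.
 */
#define EXAMPLE_FREE(session, p)    __wt_free_int(session, &(p))

static void
example_free_usage(WT_SESSION_IMPL *session)
{
    void *p;

    if (__wt_malloc(session, 64, &p) != 0)
        return;

    EXAMPLE_FREE(session, p);    /* Frees the memory, sets p to NULL. */
    EXAMPLE_FREE(session, p);    /* p is NULL, so this call is a no-op. */
}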
/*
 * __wt_log_slot_switch --
 *     Switch out the current slot and set up a new one.
 */
int
__wt_log_slot_switch(WT_SESSION_IMPL *session,
    WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work)
{
    WT_DECL_RET;
    WT_LOG *log;

    log = S2C(session)->log;

    /*
     * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the
     * compiler does not like it combined directly with the while loop
     * here.
     *
     * The loop conditional is a bit complex.  We have to retry if we
     * closed the slot but were unable to set up a new slot.  In that
     * case the flag indicating we have closed the slot will still be set.
     * We have to retry in that case regardless of the retry setting
     * because we are responsible for setting up the new slot.
     */
    do {
        WT_WITH_SLOT_LOCK(session, log,
            ret = __log_slot_switch_internal(
            session, myslot, forced, did_work));
        if (ret == EBUSY) {
            WT_STAT_CONN_INCR(session, log_slot_switch_busy);
            __wt_yield();
        }
        WT_RET(WT_SESSION_CHECK_PANIC(session));
        if (F_ISSET(S2C(session), WT_CONN_CLOSING))
            break;
    } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY));
    return (ret);
}
/*
 * __curlog_next --
 *     WT_CURSOR.next method for the step log cursor type.
 */
static int
__curlog_next(WT_CURSOR *cursor)
{
    WT_CURSOR_LOG *cl;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cl = (WT_CURSOR_LOG *)cursor;

    CURSOR_API_CALL(cursor, session, next, NULL);

    /*
     * If we don't have a record, or went to the end of the record we
     * have, or we are in the zero-fill portion of the record, get a
     * new one.
     */
    if (cl->stepp == NULL || cl->stepp >= cl->stepp_end || !*cl->stepp) {
        cl->txnid = 0;
        ret = __wt_log_scan(session, cl->next_lsn, WT_LOGSCAN_ONE,
            __curlog_logrec, cl);
        if (ret == ENOENT)
            ret = WT_NOTFOUND;
        WT_ERR(ret);
    }
    WT_ASSERT(session, cl->logrec->data != NULL);
    WT_ERR(__curlog_kv(session, cursor));
    WT_STAT_CONN_INCR(session, cursor_next);
    WT_STAT_DATA_INCR(session, cursor_next);

err:    API_END_RET(session, ret);
}
/*
 * __ovfl_read --
 *     Read an overflow item from the disk.
 */
static int
__ovfl_read(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, WT_ITEM *store)
{
    WT_BTREE *btree;
    const WT_PAGE_HEADER *dsk;

    btree = S2BT(session);

    /*
     * Read the overflow item from the block manager, then reference the
     * start of the data and set the data's length.
     *
     * Overflow reads are synchronous. That may bite me at some point, but
     * WiredTiger supports large page sizes, overflow items should be rare.
     */
    WT_RET(__wt_bt_read(session, store, addr, addr_size));
    dsk = store->data;
    store->data = WT_PAGE_HEADER_BYTE(btree, dsk);
    store->size = dsk->u.datalen;

    WT_STAT_CONN_INCR(session, cache_read_overflow);
    WT_STAT_DATA_INCR(session, cache_read_overflow);

    return (0);
}
/*
 * __log_wrlsn_server --
 *     The log wrlsn server thread.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LSN prev;
    WT_SESSION_IMPL *session;
    int yield;
    bool did_work;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    WT_INIT_LSN(&prev);
    while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
        /*
         * Write out any log record buffers if anything was done
         * since last time.  Only call the function to walk the
         * slots if the system is not idle.  On an idle system
         * the alloc_lsn will not advance and the written lsn will
         * match the alloc_lsn.
         */
        if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 ||
            __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0)
            __wt_log_wrlsn(session, &yield);
        else
            WT_STAT_CONN_INCR(session, log_write_lsn_skip);
        prev = log->alloc_lsn;
        did_work = yield == 0;

        /*
         * If __wt_log_wrlsn did work we want to yield instead of sleep.
         */
        if (yield++ < WT_THOUSAND)
            __wt_yield();
        else
            /*
             * Send in false because if we did any work we would
             * not be on this path.
             */
            __wt_cond_auto_wait(
                session, conn->log_wrlsn_cond, did_work);
    }
    /*
     * On close we need to do this one more time because there could
     * be straggling log writes that need to be written.
     */
    WT_ERR(__wt_log_force_write(session, 1, NULL));
    __wt_log_wrlsn(session, NULL);
    if (0) {
err:        __wt_err(session, ret, "log wrlsn server error");
    }
    return (WT_THREAD_RET_VALUE);
}
/*
 * __log_prealloc_once --
 *     Perform one iteration of log pre-allocation.
 */
static int
__log_prealloc_once(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    u_int i, reccount;
    char **recfiles;

    conn = S2C(session);
    log = conn->log;
    reccount = 0;
    recfiles = NULL;

    /*
     * Allocate up to the maximum number, accounting for any existing
     * files that may not have been used yet.
     */
    WT_ERR(__wt_fs_directory_list(
        session, conn->log_path, WT_LOG_PREPNAME, &recfiles, &reccount));

    /*
     * Adjust the number of files to pre-allocate if we find that
     * the critical path had to allocate them since we last ran.
     */
    if (log->prep_missed > 0) {
        conn->log_prealloc += log->prep_missed;
        __wt_verbose(session, WT_VERB_LOG,
            "Missed %" PRIu32 ". Now pre-allocating up to %" PRIu32,
            log->prep_missed, conn->log_prealloc);
    }
    WT_STAT_CONN_SET(session, log_prealloc_max, conn->log_prealloc);

    /*
     * Allocate up to the maximum number that we just computed and detected.
     */
    for (i = reccount; i < (u_int)conn->log_prealloc; i++) {
        WT_ERR(__wt_log_allocfile(
            session, ++log->prep_fileid, WT_LOG_PREPNAME));
        WT_STAT_CONN_INCR(session, log_prealloc_files);
    }

    /*
     * Reset the missed count now.  If we missed during pre-allocating
     * the log files, it means the allocation is not keeping up, not that
     * we didn't allocate enough.  So we don't just want to keep adding
     * in more.
     */
    log->prep_missed = 0;

    if (0)
err:        __wt_err(session, ret, "log pre-alloc server error");
    WT_TRET(__wt_fs_directory_list_free(session, &recfiles, reccount));
    return (ret);
}
/*
 * __wt_btcur_reset --
 *     Invalidate the cursor position.
 */
int
__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
{
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)cbt->iface.session;

    WT_STAT_CONN_INCR(session, cursor_reset);
    WT_STAT_DATA_INCR(session, cursor_reset);

    return (__cursor_reset(cbt));
}
/*
 * __wt_bm_read --
 *     Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;
    wt_off_t offset;
    uint32_t checksum, size;
    bool mapped;

    WT_UNUSED(addr_size);
    block = bm->block;

    /* Crack the cookie. */
    WT_RET(
        __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

    /*
     * Map the block if it's possible.
     */
    handle = block->fh->handle;
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped && handle->fh_map_preload != NULL) {
        buf->data = (uint8_t *)bm->map + offset;
        buf->size = size;
        ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
            buf->data, buf->size, bm->mapped_cookie);

        WT_STAT_CONN_INCR(session, block_map_read);
        WT_STAT_CONN_INCRV(session, block_byte_map_read, size);
        return (ret);
    }

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * the available list, or for live systems, the discard list.
     */
    WT_RET(__wt_block_misplaced(session,
        block, "read", offset, size, bm->is_live, __func__, __LINE__));
#endif

    /* Read the block. */
    __wt_capacity_throttle(session, size, WT_THROTTLE_READ);
    WT_RET(
        __wt_block_read_off(session, block, buf, offset, size, checksum));

    /* Optionally discard blocks from the system's buffer cache. */
    WT_RET(__wt_block_discard(session, block, (size_t)size));

    return (0);
}
/*
 * __async_search --
 *     WT_ASYNC_OP->search implementation for op handles.
 */
static int
__async_search(WT_ASYNC_OP *asyncop)
{
    WT_ASYNC_OP_IMPL *op;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    op = (WT_ASYNC_OP_IMPL *)asyncop;
    ASYNCOP_API_CALL(O2C(op), session, search);
    WT_STAT_CONN_INCR(O2S(op), async_op_search);
    WT_ERR(__async_op_wrap(op, WT_AOP_SEARCH));
err:    API_END_RET(session, ret);
}
/*
 * __cursor_truncate_fix --
 *     Discard a cursor range from fixed-width column-store tree.
 */
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool))
{
    WT_DECL_RET;
    const uint8_t *value;

    /*
     * Handle fixed-length column-store objects separately: for row-store
     * and variable-length column-store objects we have "deleted" values
     * and so returned objects actually exist: fixed-length column-store
     * objects are filled-in if they don't exist, that is, if you create
     * record 37, records 1-36 magically appear.  Those records can't be
     * deleted, which means we have to ignore already "deleted" records.
     *
     * First, call the standard cursor remove method to do a full search and
     * re-position the cursor because we don't have a saved copy of the
     * page's write generation information, which we need to remove records.
     * Once that's done, we can delete records without a full search, unless
     * we encounter a restart error because the page was modified by some
     * other thread of control; in that case, repeat the full search to
     * refresh the page's modification information.
     */
retry:    WT_RET(__wt_btcur_remove(start));

    /*
     * Reset ret each time through so that we don't loop forever in
     * the cursor equals case.
     */
    for (ret = 0;;) {
        if (stop != NULL && __cursor_equals(start, stop))
            break;
        if ((ret = __wt_btcur_next(start, true)) != 0)
            break;
        start->compare = 0;    /* Exact match */
        value = (const uint8_t *)start->iface.value.data;
        if (*value != 0 &&
            (ret = rmfunc(session, start, 1)) != 0)
            break;
    }

    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }

    WT_RET_NOTFOUND_OK(ret);
    return (0);
}
/*
 * __cursor_truncate --
 *     Discard a cursor range from row-store or variable-width column-store
 *     tree.
 */
static int
__cursor_truncate(WT_SESSION_IMPL *session,
    WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
    int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool))
{
    WT_DECL_RET;

    /*
     * First, call the standard cursor remove method to do a full search and
     * re-position the cursor because we don't have a saved copy of the
     * page's write generation information, which we need to remove records.
     * Once that's done, we can delete records without a full search, unless
     * we encounter a restart error because the page was modified by some
     * other thread of control; in that case, repeat the full search to
     * refresh the page's modification information.
     *
     * If this is a row-store, we delete leaf pages having no overflow items
     * without reading them; for that to work, we have to ensure we read the
     * page referenced by the ending cursor, since we may be deleting only a
     * partial page at the end of the truncation.  Our caller already fully
     * instantiated the end cursor, so we know that page is pinned in memory
     * and we can proceed without concern.
     */
retry:    WT_RET(__wt_btcur_remove(start));

    /*
     * Reset ret each time through so that we don't loop forever in
     * the cursor equals case.
     */
    for (ret = 0;;) {
        if (stop != NULL && __cursor_equals(start, stop))
            break;
        if ((ret = __wt_btcur_next(start, true)) != 0)
            break;
        start->compare = 0;    /* Exact match */
        if ((ret = rmfunc(session, start, 1)) != 0)
            break;
    }

    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }

    WT_RET_NOTFOUND_OK(ret);
    return (0);
}
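/*
 * Illustrative sketch of the retry-on-WT_RESTART pattern used by the two
 * truncate loops above (and by the cursor modify paths below). The helper
 * and its function-pointer argument are hypothetical, not WiredTiger source:
 * the point is only that WT_RESTART means another thread modified the page
 * underneath us, so the operation is repeated from the full-search step
 * rather than returned to the caller as an error.
 */
static int
example_retry_restart(WT_SESSION_IMPL *session,
    int (*op)(WT_SESSION_IMPL *))
{
    WT_DECL_RET;

retry:    if ((ret = op(session)) == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }
    return (ret);
}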
/*
 * __wt_session_compact_readonly --
 *     WT_SESSION.compact method; readonly version.
 */
int
__wt_session_compact_readonly(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    WT_UNUSED(uri);
    WT_UNUSED(config);

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL_NOCONF(session, compact);

    WT_STAT_CONN_INCR(session, session_table_compact_fail);
    ret = __wt_session_notsup(session);
err:    API_END_RET(session, ret);
}
/*
 * __curds_reset --
 *     WT_CURSOR.reset method for the data-source cursor type.
 */
static int
__curds_reset(WT_CURSOR *cursor)
{
    WT_CURSOR *source;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

    CURSOR_API_CALL(cursor, session, reset, NULL);

    WT_STAT_CONN_INCR(session, cursor_reset);
    WT_STAT_DATA_INCR(session, cursor_reset);

    WT_ERR(source->reset(source));

    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

err:    API_END_RET(session, ret);
}
/*
 * __curds_prev --
 *     WT_CURSOR.prev method for the data-source cursor type.
 */
static int
__curds_prev(WT_CURSOR *cursor)
{
    WT_CURSOR *source;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

    CURSOR_API_CALL(cursor, session, prev, NULL);

    WT_STAT_CONN_INCR(session, cursor_prev);
    WT_STAT_DATA_INCR(session, cursor_prev);

    WT_ERR(__curds_txn_enter(session, false));

    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

    ret = __curds_cursor_resolve(cursor, source->prev(source));

err:    __curds_txn_leave(session);

    API_END_RET(session, ret);
}
/*
 * __wt_btcur_update_check --
 *     Check whether an update would conflict.
 *
 *     This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so
 *     they only check for conflicts without updating the tree.  It is used
 *     to maintain snapshot isolation for transactions that span multiple
 *     chunks in an LSM tree.
 */
int
__wt_btcur_update_check(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cursor = &cbt->iface;
    btree = cbt->btree;
    session = (WT_SESSION_IMPL *)cursor->session;

retry:    WT_RET(__cursor_func_init(cbt, true));

    switch (btree->type) {
    case BTREE_ROW:
        WT_ERR(__cursor_row_search(session, cbt, NULL, true));

        /*
         * Just check for conflicts.
         */
        ret = __curfile_update_check(cbt);
        break;
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__wt_illegal_value(session, NULL));
        break;
    }

err:    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }
    WT_TRET(__curfile_leave(cbt));
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __logmgr_force_ckpt --
 *     Force a checkpoint out, waiting until the checkpoint LSN in the log
 *     reaches the given log number.
 */
static int
__logmgr_force_ckpt(WT_SESSION_IMPL *session, uint32_t lognum)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_SESSION_IMPL *tmp_session;
    int yield;

    conn = S2C(session);
    log = conn->log;
    yield = 0;
    WT_RET(__wt_open_internal_session(conn,
        "compatibility-reconfig", true, 0, &tmp_session));
    while (log->ckpt_lsn.l.file < lognum) {
        /*
         * Force a checkpoint to be written in the new log file and
         * force the archiving of all previous log files.  We do the
         * checkpoint in the loop because the checkpoint LSN in the
         * log record could still reflect the previous log file in
         * cases such as the write LSN has not yet advanced into the
         * new log file due to another group of threads still in
         * progress with their slot copies or writes.
         */
        WT_RET(tmp_session->iface.checkpoint(
            &tmp_session->iface, "force=1"));
        WT_RET(WT_SESSION_CHECK_PANIC(tmp_session));

        /*
         * Only sleep in the rare case that we had to come through
         * this loop more than once.
         */
        if (yield++) {
            WT_STAT_CONN_INCR(session, log_force_ckpt_sleep);
            __wt_sleep(0, WT_THOUSAND);
        }
    }
    WT_RET(tmp_session->iface.close(&tmp_session->iface, NULL));
    return (0);
}
/*
 * __wt_bm_preload --
 *     Pre-load a page.
 */
int
__wt_bm_preload(
    WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
    WT_BLOCK *block;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;
    wt_off_t offset;
    uint32_t checksum, size;
    bool mapped;

    block = bm->block;

    WT_STAT_CONN_INCR(session, block_preload);

    /* Crack the cookie. */
    WT_RET(
        __wt_block_buffer_to_addr(block, addr, &offset, &size, &checksum));

    handle = block->fh->handle;
    mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
    if (mapped && handle->fh_map_preload != NULL)
        ret = handle->fh_map_preload(handle, (WT_SESSION *)session,
            (uint8_t *)bm->map + offset, size, bm->mapped_cookie);
    if (!mapped && handle->fh_advise != NULL)
        ret = handle->fh_advise(handle, (WT_SESSION *)session,
            offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED);
    if (ret != EBUSY && ret != ENOTSUP)
        return (ret);

    /* If preload isn't supported, do it the slow way. */
    WT_RET(__wt_scr_alloc(session, 0, &tmp));
    ret = __wt_bm_read(bm, session, tmp, addr, addr_size);
    __wt_scr_free(session, &tmp);

    return (ret);
}
/*
 * __curds_search --
 *     WT_CURSOR.search method for the data-source cursor type.
 */
static int
__curds_search(WT_CURSOR *cursor)
{
    WT_CURSOR *source;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

    CURSOR_API_CALL(cursor, session, search, NULL);

    WT_STAT_CONN_INCR(session, cursor_search);
    WT_STAT_DATA_INCR(session, cursor_search);

    WT_ERR(__curds_txn_enter(session, false));

    WT_ERR(__curds_key_set(cursor));
    ret = __curds_cursor_resolve(cursor, source->search(source));

err:    __curds_txn_leave(session);

    API_END_RET(session, ret);
}
/*
 * __wt_btcur_next_random --
 *     Move to a random record in the tree.  There are two algorithms, one
 *     where we select a record at random from the whole tree on each
 *     retrieval and one where we first select a record at random from the
 *     whole tree, and then subsequently sample forward from that location.
 *     The sampling approach allows us to select reasonably uniform random
 *     points from unbalanced trees.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;
    wt_off_t size;
    uint64_t skip;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    btree = cbt->btree;

    /*
     * Only supports row-store: applications can trivially select a random
     * value from a column-store, if there were any reason to do so.
     */
    if (btree->type != BTREE_ROW)
        WT_RET_MSG(session, ENOTSUP,
            "WT_CURSOR.next_random only supported by row-store tables");

    WT_STAT_CONN_INCR(session, cursor_next);
    WT_STAT_DATA_INCR(session, cursor_next);

    /*
     * If retrieving random values without sampling, or we don't have a
     * page reference, pick a roughly random leaf page in the tree.
     */
    if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
        /*
         * Skip past the sample size of the leaf pages in the tree
         * between each random key return to compensate for unbalanced
         * trees.
         *
         * Use the underlying file size divided by its block allocation
         * size as our guess of leaf pages in the file (this can be
         * entirely wrong, as it depends on how many pages are in this
         * particular checkpoint, how large the leaf and internal pages
         * really are, and other factors).  Then, divide that value by
         * the configured sample size and increment the final result to
         * make sure tiny files don't leave us with a skip value of 0.
         *
         * !!!
         * Ideally, the number would be prime to avoid restart issues.
         */
        if (cbt->next_random_sample_size != 0) {
            WT_ERR(btree->bm->size(btree->bm, session, &size));
            cbt->next_random_leaf_skip = (uint64_t)
                ((size / btree->allocsize) /
                cbt->next_random_sample_size) + 1;
        }

        /*
         * Choose a leaf page from the tree.
         */
        WT_ERR(__cursor_func_init(cbt, true));
        WT_WITH_PAGE_INDEX(
            session, ret = __wt_row_random_descent(session, cbt));
        WT_ERR(ret);
    } else {
        /*
         * Read through the tree, skipping leaf pages.  Be cautious about
         * the skip count: if the last leaf page skipped was also the
         * last leaf page in the tree, it may be set to zero on return
         * with the end-of-walk condition.
         *
         * Pages read for data sampling aren't "useful"; don't update
         * the read generation of pages already in memory, and if a page
         * is read, set its generation to a low value so it is evicted
         * quickly.
         */
        for (skip = cbt->next_random_leaf_skip;
            cbt->ref == NULL || skip > 0;)
            WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
                WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
    }

    /*
     * Select a random entry from the leaf page.  If it's not valid, move to
     * the next entry, if that doesn't work, move to the previous entry.
     */
    WT_ERR(__wt_row_random_leaf(session, cbt));
    if (__cursor_valid(cbt, &upd))
        WT_ERR(__wt_kv_return(session, cbt, upd));
    else {
        if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
            ret = __wt_btcur_prev(cbt, false);
        WT_ERR(ret);
    }
    return (0);

err:    WT_TRET(__cursor_reset(cbt));
    return (ret);
}
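/*
 * Illustrative arithmetic for the next_random_leaf_skip computation above;
 * the helper and the numbers in the comment are hypothetical, only the
 * formula matches the code.  For example, a 1GB file with a 4KB allocation
 * size gives a leaf-page estimate of 1073741824 / 4096 = 262144; with a
 * configured sample size of 100 the skip is 262144 / 100 + 1 = 2622, so
 * roughly every 2622nd leaf page is visited between returned keys, and the
 * "+ 1" keeps the skip non-zero for tiny files.
 */
static uint64_t
example_leaf_skip(wt_off_t file_size, uint32_t allocsize, u_int sample_size)
{
    return ((uint64_t)((file_size / allocsize) / sample_size) + 1);
}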
/*
 * __wt_btcur_update --
 *     Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_CONN_INCR(session, cursor_update);
    WT_STAT_DATA_INCR(session, cursor_update);
    WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));
    WT_RET(__cursor_size_chk(session, &cursor->value));

    /*
     * The tree is no longer empty: eviction should pay attention to it,
     * and it's no longer possible to bulk-load into it.
     */
    if (btree->bulk_load_ok) {
        btree->bulk_load_ok = false;
        __wt_btree_evictable(session, true);
    }

retry:    WT_RET(__cursor_func_init(cbt, true));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__cursor_col_search(session, cbt, NULL));

        /*
         * If not overwriting, fail if the key doesn't exist.  If we
         * find an update for the key, check for conflicts.  Update the
         * record if it exists.  Creating a record past the end of the
         * tree in a fixed-length column-store implicitly fills the gap
         * with empty records.  Update the record in that case, the
         * record exists.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
            WT_ERR(__curfile_update_check(cbt));
            if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) &&
                !__cursor_fix_implicit(btree, cbt))
                WT_ERR(WT_NOTFOUND);
        }
        ret = __cursor_col_modify(session, cbt, false);
        break;
    case BTREE_ROW:
        WT_ERR(__cursor_row_search(session, cbt, NULL, true));
        /*
         * If not overwriting, check for conflicts and fail if the key
         * does not exist.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
            WT_ERR(__curfile_update_check(cbt));
            if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
                WT_ERR(WT_NOTFOUND);
        }
        ret = __cursor_row_modify(session, cbt, false);
        break;
    }

err:    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }

    /*
     * If successful, point the cursor at internal copies of the data.  We
     * could shuffle memory in the cursor so the key/value pair are in local
     * buffer memory, but that's a data copy.  We don't want to do another
     * search (and we might get a different update structure if we race).
     * To make this work, we add a field to the btree cursor to pass back a
     * pointer to the modify function's allocated update structure.
     */
    if (ret == 0)
        WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));

    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_btcur_remove --
 *     Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_CONN_INCR(session, cursor_remove);
    WT_STAT_DATA_INCR(session, cursor_remove);
    WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

retry:    WT_RET(__cursor_func_init(cbt, true));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__cursor_col_search(session, cbt, NULL));

        /*
         * If we find a matching record, check whether an update would
         * conflict.  Do this before checking if the update is visible
         * in __cursor_valid, or we can miss a conflict.
         */
        WT_ERR(__curfile_update_check(cbt));

        /* Remove the record if it exists. */
        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
            if (!__cursor_fix_implicit(btree, cbt))
                WT_ERR(WT_NOTFOUND);
            /*
             * Creating a record past the end of the tree in a
             * fixed-length column-store implicitly fills the gap with
             * empty records.  Return success in that case, the record
             * was deleted successfully.
             *
             * Correct the btree cursor's location: the search will
             * have pointed us at the previous/next item, and that's
             * not correct.
             */
            cbt->recno = cursor->recno;
        } else
            ret = __cursor_col_modify(session, cbt, true);
        break;
    case BTREE_ROW:
        /* Remove the record if it exists. */
        WT_ERR(__cursor_row_search(session, cbt, NULL, false));

        /* Check whether an update would conflict. */
        WT_ERR(__curfile_update_check(cbt));

        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
            WT_ERR(WT_NOTFOUND);

        ret = __cursor_row_modify(session, cbt, true);
        break;
    }

err:    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }
    /*
     * If the cursor is configured to overwrite and the record is not
     * found, that is exactly what we want.
     */
    if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
        ret = 0;

    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));

    return (ret);
}
/*
 * __wt_btcur_insert --
 *     Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_CONN_INCR(session, cursor_insert);
    WT_STAT_DATA_INCR(session, cursor_insert);
    WT_STAT_DATA_INCRV(session,
        cursor_insert_bytes, cursor->key.size + cursor->value.size);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));
    WT_RET(__cursor_size_chk(session, &cursor->value));

    /*
     * The tree is no longer empty: eviction should pay attention to it,
     * and it's no longer possible to bulk-load into it.
     */
    if (btree->bulk_load_ok) {
        btree->bulk_load_ok = false;
        __wt_btree_evictable(session, true);
    }

retry:    WT_RET(__cursor_func_init(cbt, true));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        /*
         * If WT_CURSTD_APPEND is set, insert a new record (ignoring
         * the application's record number).  The real record number
         * is assigned by the serialized append operation.
         */
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = WT_RECNO_OOB;

        WT_ERR(__cursor_col_search(session, cbt, NULL));

        /*
         * If not overwriting, fail if the key exists.  Creating a
         * record past the end of the tree in a fixed-length
         * column-store implicitly fills the gap with empty records.
         * Fail in that case, the record exists.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
            ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) ||
            (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
            WT_ERR(WT_DUPLICATE_KEY);

        WT_ERR(__cursor_col_modify(session, cbt, false));
        if (F_ISSET(cursor, WT_CURSTD_APPEND))
            cbt->iface.recno = cbt->recno;
        break;
    case BTREE_ROW:
        WT_ERR(__cursor_row_search(session, cbt, NULL, true));
        /*
         * If not overwriting, fail if the key exists, else insert the
         * key/value pair.
         */
        if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
            cbt->compare == 0 && __cursor_valid(cbt, NULL))
            WT_ERR(WT_DUPLICATE_KEY);

        ret = __cursor_row_modify(session, cbt, false);
        break;
    }

err:    if (ret == WT_RESTART) {
        WT_STAT_CONN_INCR(session, cursor_restart);
        WT_STAT_DATA_INCR(session, cursor_restart);
        goto retry;
    }
    /* Insert doesn't maintain a position across calls, clear resources. */
    if (ret == 0)
        WT_TRET(__curfile_leave(cbt));
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
/*
 * __wt_btcur_search_near --
 *     Search for a record in the tree.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;
    int exact;
    bool valid;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;
    upd = NULL;                    /* -Wuninitialized */
    exact = 0;

    WT_STAT_CONN_INCR(session, cursor_search_near);
    WT_STAT_DATA_INCR(session, cursor_search_near);

    /*
     * If we have a row-store page pinned, search it; if we don't have a
     * page pinned, or the search of the pinned page doesn't find an exact
     * match, search from the root.  Unlike WT_CURSOR.search, ignore pinned
     * pages in the case of column-store, search-near isn't an interesting
     * enough case for column-store to add the complexity needed to avoid
     * the tree search.
     *
     * Set the "insert" flag for the btree row-store search; we may intend
     * to position the cursor at the end of the tree, rather than match an
     * existing record.
     */
    valid = false;
    if (btree->type == BTREE_ROW &&
        F_ISSET(cbt, WT_CBT_ACTIVE) &&
        cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
        __wt_txn_cursor_op(session);

        WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

        /*
         * Search-near is trickier than search when searching an already
         * pinned page.  If search returns the first or last page slots,
         * discard the results and search the full tree as the neighbor
         * pages might offer better matches.  This test is simplistic as
         * we're ignoring append lists (there may be no page slots or we
         * might be legitimately positioned after the last page slot).
         * Ignore those cases, it makes things too complicated.
         */
        if (cbt->slot != 0 &&
            cbt->slot != cbt->ref->page->pg_row_entries - 1)
            valid = __cursor_valid(cbt, &upd);
    }
    if (!valid) {
        WT_ERR(__cursor_func_init(cbt, true));
        WT_ERR(btree->type == BTREE_ROW ?
            __cursor_row_search(session, cbt, NULL, true) :
            __cursor_col_search(session, cbt, NULL));
        valid = __cursor_valid(cbt, &upd);
    }

    /*
     * If we find a valid key, return it.
     *
     * Else, creating a record past the end of the tree in a fixed-length
     * column-store implicitly fills the gap with empty records.  In this
     * case, we instantiate the empty record, it's an exact match.
     *
     * Else, move to the next key in the tree (bias for prefix searches).
     * Cursor next skips invalid rows, so we don't have to test for them
     * again.
     *
     * Else, redo the search and move to the previous key in the tree.
     * Cursor previous skips invalid rows, so we don't have to test for
     * them again.
     *
     * If that fails, quit, there's no record to return.
     */
    if (valid) {
        exact = cbt->compare;
        ret = __wt_kv_return(session, cbt, upd);
    } else if (__cursor_fix_implicit(btree, cbt)) {
        cbt->recno = cursor->recno;
        cbt->v = 0;
        cursor->value.data = &cbt->v;
        cursor->value.size = 1;
        exact = 0;
    } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
        exact = 1;
    else {
        WT_ERR(__cursor_func_init(cbt, true));
        WT_ERR(btree->type == BTREE_ROW ?
            __cursor_row_search(session, cbt, NULL, true) :
            __cursor_col_search(session, cbt, NULL));
        if (__cursor_valid(cbt, &upd)) {
            exact = cbt->compare;
            ret = __wt_kv_return(session, cbt, upd);
        } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
            exact = -1;
    }

#ifdef HAVE_DIAGNOSTIC
    if (ret == 0)
        WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
        *exactp = exact;
    return (ret);
}
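/*
 * Illustrative application-level sketch, not WiredTiger source: how the
 * "exact" value filled in by search-near is interpreted by a caller through
 * the public WT_CURSOR API.  The table name, string key format and key are
 * hypothetical assumptions.
 */
static int
example_search_near(WT_SESSION *wt_session)
{
    WT_CURSOR *cursor;
    int exact, ret;

    if ((ret = wt_session->open_cursor(
        wt_session, "table:example", NULL, NULL, &cursor)) != 0)
        return (ret);

    cursor->set_key(cursor, "some-key");
    if ((ret = cursor->search_near(cursor, &exact)) == 0) {
        if (exact < 0) {
            /* Positioned on the nearest key smaller than "some-key". */
        } else if (exact == 0) {
            /* Positioned on an exact match. */
        } else {
            /* Positioned on the nearest key larger than "some-key". */
        }
    }
    return (cursor->close(cursor));
}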