/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; bool busy, cache_work, oldgen, stalled; int force_attempts; btree = S2BT(session); for (oldgen = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: if (LF_ISSET(WT_READ_NO_EMPTY) && __wt_delete_page_skip(session, ref, false)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_DISK: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = true; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = true; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, ref)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = true; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. 
*/ page = ref->page; if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_bump(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. */ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += WT_THOUSAND; else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
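The bottom of the loop is a classic yield-then-sleep backoff: spin cheaply while the wait is expected to be short, then sleep with a growing, capped interval so a stalled thread stops burning CPU. A self-contained sketch of the same pattern (the ready callback is hypothetical; this is not WiredTiger code):

#include <sched.h>
#include <stdbool.h>
#include <unistd.h>

#define SPIN_LIMIT   1000		/* yields before we start sleeping */
#define SLEEP_MAX_US 10000		/* sleep cap, in microseconds */

/* Spin politely first, then sleep with a growing, capped interval. */
static void
wait_for(bool (*ready)(void *), void *arg)
{
	unsigned sleep_us = 0, wait_cnt = 0;

	while (!ready(arg)) {
		if (++wait_cnt < SPIN_LIMIT) {
			sched_yield();	/* cheap: give up the CPU slice */
			continue;
		}
		/* Burned enough CPU: nap, lengthening the nap each time. */
		sleep_us += SPIN_LIMIT;
		if (sleep_us > SLEEP_MAX_US)
			sleep_us = SLEEP_MAX_US;
		usleep(sleep_us);
	}
}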
/* * __wt_bt_write -- * Write a buffer into a block, returning the block's addr/size and * checksum. */ int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed) { WT_BM *bm; WT_BTREE *btree; WT_ITEM *ip; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE_HEADER *dsk; size_t len, src_len, dst_len, result_len, size; int data_cksum, compression_failed; uint8_t *src, *dst; btree = S2BT(session); bm = btree->bm; /* Checkpoint calls are different from standard calls. */ WT_ASSERT(session, (checkpoint == 0 && addr != NULL && addr_sizep != NULL) || (checkpoint == 1 && addr == NULL && addr_sizep == NULL)); #ifdef HAVE_DIAGNOSTIC /* * We're passed a table's disk image. Decompress if necessary and * verify the image. Always check the in-memory length for accuracy. */ dsk = buf->mem; if (compressed) { WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp)); memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP); WT_ERR(btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP, buf->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->memsize - WT_BLOCK_COMPRESS_SKIP, &result_len)); WT_ASSERT(session, dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP); tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; ip = tmp; } else { WT_ASSERT(session, dsk->mem_size == buf->size); ip = buf; } WT_ERR(__wt_verify_dsk(session, "[write-check]", ip)); __wt_scr_free(&tmp); #endif /* * Optionally stream-compress the data, but don't compress blocks that * are already as small as they're going to get. */ if (btree->compressor == NULL || btree->compressor->compress == NULL || compressed) ip = buf; else if (buf->size <= btree->allocsize) { ip = buf; WT_STAT_FAST_DATA_INCR(session, compress_write_too_small); } else { /* Skip the header bytes of the source data. */ src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP; src_len = buf->size - WT_BLOCK_COMPRESS_SKIP; /* * Compute the size needed for the destination buffer. We only * allocate enough memory for a copy of the original by default; * if any compressed version is bigger than the original, we * won't use it. However, some compression engines (snappy is * one example) may need more memory because they don't stop * just because there's no more memory into which to compress. */ if (btree->compressor->pre_size == NULL) len = src_len; else WT_ERR(btree->compressor->pre_size(btree->compressor, &session->iface, src, src_len, &len)); size = len + WT_BLOCK_COMPRESS_SKIP; WT_ERR(bm->write_size(bm, session, &size)); WT_ERR(__wt_scr_alloc(session, size, &tmp)); /* Skip the header bytes of the destination data. */ dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP; dst_len = len; compression_failed = 0; WT_ERR(btree->compressor->compress(btree->compressor, &session->iface, src, src_len, dst, dst_len, &result_len, &compression_failed)); result_len += WT_BLOCK_COMPRESS_SKIP; /* * If compression fails, or doesn't gain us at least one unit of * allocation, fall back to the original version. This isn't * unexpected: if compression doesn't work for some chunk of * data for some reason (likely because compressed output * requires additional format/header information), it just means * the uncompressed version is as good as it gets, and that's * what we use. 
*/ if (compression_failed || buf->size / btree->allocsize == result_len / btree->allocsize) { ip = buf; WT_STAT_FAST_DATA_INCR(session, compress_write_fail); } else { compressed = 1; WT_STAT_FAST_DATA_INCR(session, compress_write); /* * Copy in the skipped header bytes, set the final data * size. */ memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP); tmp->size = result_len; ip = tmp; } } dsk = ip->mem; /* If the buffer is compressed, set the flag. */ if (compressed) F_SET(dsk, WT_PAGE_COMPRESSED); /* * We increment the block's write generation so it's easy to identify * newer versions of blocks during salvage. (It's common in WiredTiger, * at least for the default block manager, for multiple blocks to be * internally consistent with identical first and last keys, so we need * a way to know the most recent state of the block. We could check * which leaf is referenced by a valid internal page, but that implies * salvaging internal pages, which I don't want to do, and it's not * as good anyway, because the internal page may not have been written * after the leaf page was updated. So, write generations it is. * * Nothing is locked at this point but two versions of a page with the * same generation is pretty unlikely, and if we did, they're going to * be roughly identical for the purposes of salvage, anyway. */ dsk->write_gen = ++btree->write_gen; /* * Checksum the data if the buffer isn't compressed or checksums are * configured. */ switch (btree->checksum) { case CKSUM_ON: data_cksum = 1; break; case CKSUM_OFF: data_cksum = 0; break; case CKSUM_UNCOMPRESSED: default: data_cksum = !compressed; break; } /* Call the block manager to write the block. */ WT_ERR(checkpoint ? bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) : bm->write(bm, session, ip, addr, addr_sizep, data_cksum)); WT_STAT_FAST_CONN_INCR(session, cache_write); WT_STAT_FAST_DATA_INCR(session, cache_write); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size); err: __wt_scr_free(&tmp); return (ret); }
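The compression decision hinges on whole allocation units: blocks are written in multiples of the allocation size, so fewer bytes only help if they mean fewer units. A minimal sketch of that test (illustrative names, not the WiredTiger macros):

#include <stdbool.h>
#include <stddef.h>

/*
 * Blocks are stored in multiples of the allocation size, so a smaller
 * byte count only pays off if it reduces the number of allocation units.
 */
static bool
compression_worthwhile(size_t raw_len, size_t compressed_len, size_t allocsize)
{
	return (compressed_len / allocsize < raw_len / allocsize);
}

Shaving a few hundred bytes off a block typically leaves it occupying the same number of 4KB units, and in that case the uncompressed image is written.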
/* * __wt_log_wrlsn -- * Process written log slots and attempt to coalesce them if the LSNs * are contiguous. The purpose of this function is to advance the * write_lsn in LSN order after the buffer is written to the log file. */ void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) { WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *coalescing, *slot; WT_LSN save_lsn; size_t written_i; uint32_t i, save_i; conn = S2C(session); log = conn->log; __wt_spin_lock(session, &log->log_writelsn_lock); restart: coalescing = NULL; WT_INIT_LSN(&save_lsn); written_i = 0; i = 0; /* * Walk the array once saving any slots that are in the * WT_LOG_SLOT_WRITTEN state. */ while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; written[written_i++].lsn = slot->slot_release_lsn; } /* * If we found any written slots process them. We sort them * based on the release LSN, and then look for them in order. */ if (written_i > 0) { if (yield != NULL) *yield = 0; WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); /* * We know the written array is sorted by LSN. Go * through them either advancing write_lsn or coalesce * contiguous ranges of written slots. */ for (i = 0; i < written_i; i++) { slot = &log->slot_pool[written[i].slot_index]; /* * The log server thread pushes out slots periodically. * Sometimes they are empty slots. If we find an * empty slot, where empty means the start and end LSN * are the same, free it and continue. */ if (__wt_log_cmp(&slot->slot_start_lsn, &slot->slot_release_lsn) == 0 && __wt_log_cmp(&slot->slot_start_lsn, &slot->slot_end_lsn) == 0) { __wt_log_slot_free(session, slot); continue; } if (coalescing != NULL) { /* * If the write_lsn changed, we may be able to * process slots. Try again. */ if (__wt_log_cmp( &log->write_lsn, &save_lsn) != 0) goto restart; if (__wt_log_cmp(&coalescing->slot_end_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; } /* * If we get here we have a slot to coalesce * and free. */ coalescing->slot_last_offset = slot->slot_last_offset; coalescing->slot_end_lsn = slot->slot_end_lsn; WT_STAT_FAST_CONN_INCR( session, log_slot_coalesced); /* * Copy the flag for later closing. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) F_SET(coalescing, WT_SLOT_CLOSEFH); } else { /* * If this written slot is not the next LSN, * try to start coalescing with later slots. * A synchronous write may update write_lsn * so save the last one we saw to check when * coalescing slots. */ save_lsn = log->write_lsn; if (__wt_log_cmp( &log->write_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; } /* * If we get here we have a slot to process. * Advance the LSN and process the slot. */ WT_ASSERT(session, __wt_log_cmp(&written[i].lsn, &slot->slot_release_lsn) == 0); /* * We need to maintain the starting offset of * a log record so that the checkpoint LSN * refers to the beginning of a real record. * The last offset in a slot is kept so that * the checkpoint LSN is close to the end of * the record. */ if (slot->slot_start_lsn.l.offset != slot->slot_last_offset) slot->slot_start_lsn.l.offset = (uint32_t)slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; __wt_cond_signal(session, log->log_write_cond); WT_STAT_FAST_CONN_INCR(session, log_write_lsn); /* * Signal the close thread if needed. 
*/ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) __wt_cond_signal( session, conn->log_file_cond); } __wt_log_slot_free(session, slot); } } __wt_spin_unlock(session, &log->log_writelsn_lock); }
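The coalescing walk is easier to see on plain byte ranges: retire any range contiguous with the write position, and glue together later neighbors so they can be retired in one step once the gap fills. A simplified sketch (hypothetical types, no locking, and unlike the real function it does not restart the scan when the write position advances):

#include <stddef.h>
#include <stdint.h>

struct range {
	uint64_t start, end;		/* written byte range, end exclusive */
};

/*
 * Given ranges sorted by start offset, advance write_pos across every
 * contiguous range and merge later adjacent ranges into pending runs.
 */
static uint64_t
advance_write_pos(struct range *r, size_t n, uint64_t write_pos)
{
	struct range *coalescing = NULL;
	size_t i;

	for (i = 0; i < n; i++)
		if (r[i].start == write_pos) {
			write_pos = r[i].end;	/* contiguous: retire it */
			coalescing = NULL;	/* pending runs re-checked later */
		} else if (coalescing != NULL && coalescing->end == r[i].start)
			coalescing->end = r[i].end;	/* extend the pending run */
		else
			coalescing = &r[i];	/* start a new pending run */
	return (write_pos);
}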
/* * __wt_realloc_aligned -- * ANSI realloc function that aligns to buffer boundaries, configured with * the "buffer_alignment" key to wiredtiger_open. */ int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) { #if defined(HAVE_POSIX_MEMALIGN) WT_DECL_RET; /* * !!! * This function MUST handle a NULL WT_SESSION_IMPL handle. */ if (session != NULL && S2C(session)->buffer_alignment > 0) { void *p, *newp; size_t bytes_allocated; /* * Sometimes we're allocating memory and we don't care about the * final length -- bytes_allocated_ret may be NULL. */ p = *(void **)retp; bytes_allocated = (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret; WT_ASSERT(session, (p == NULL && bytes_allocated == 0) || (p != NULL && (bytes_allocated_ret == NULL || bytes_allocated != 0))); WT_ASSERT(session, bytes_to_allocate != 0); WT_ASSERT(session, bytes_allocated < bytes_to_allocate); if (session != NULL) WT_STAT_FAST_CONN_INCR(session, memory_allocation); if ((ret = posix_memalign(&newp, S2C(session)->buffer_alignment, bytes_to_allocate)) != 0) WT_RET_MSG(session, ret, "memory allocation"); if (p != NULL) memcpy(newp, p, bytes_allocated); __wt_free(session, p); p = newp; /* Clear the allocated memory (see above). */ memset((uint8_t *)p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) *bytes_allocated_ret = bytes_to_allocate; *(void **)retp = p; return (0); } #endif /* * If there is no posix_memalign function, or no alignment configured, * fall back to realloc. */ return (__wt_realloc( session, bytes_allocated_ret, bytes_to_allocate, retp)); }
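posix_memalign has no realloc counterpart, so the aligned path is allocate, copy, zero the tail, free. A minimal sketch outside the WT_SESSION machinery (the alignment must be a power of two and a multiple of sizeof(void *), and new_len must be larger than the current allocation):

#include <stdlib.h>
#include <string.h>

static int
realloc_aligned(void **bufp, size_t *allocatedp, size_t new_len, size_t align)
{
	void *newp;

	if (posix_memalign(&newp, align, new_len) != 0)
		return (-1);
	if (*bufp != NULL)
		memcpy(newp, *bufp, *allocatedp);	/* preserve old bytes */
	memset((char *)newp + *allocatedp, 0, new_len - *allocatedp);
	free(*bufp);
	*bufp = newp;
	*allocatedp = new_len;
	return (0);
}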
/* * __wt_btcur_insert -- * Insert a record into the tree. */ int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_FAST_CONN_INCR(session, cursor_insert); WT_STAT_FAST_DATA_INCR(session, cursor_insert); WT_STAT_FAST_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = 0; __wt_btree_evictable(session, 1); } retry: WT_RET(__cursor_func_init(cbt, 1)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring * the application's record number). First we search for the * maximum possible record number so the search ends on the * last page. The real record number is assigned by the * serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = UINT64_MAX; WT_ERR(__cursor_col_search(session, cbt, NULL)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = 0; /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); WT_ERR(__cursor_col_modify(session, cbt, 0)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = cbt->recno; break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, 1)); /* * If not overwriting, fail if the key exists, else insert the * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && cbt->compare == 0 && __cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, 0); break; WT_ILLEGAL_VALUE_ERR(session); } err: if (ret == WT_RESTART) { WT_STAT_FAST_CONN_INCR(session, cursor_restart); WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; } /* Insert doesn't maintain a position across calls, clear resources. */ if (ret == 0) WT_TRET(__curfile_leave(cbt)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
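From the application's side this code sits behind WT_CURSOR::insert. A hedged usage sketch against the public WiredTiger API, assuming a table created with string key and value formats ("table:example" is a placeholder):

#include <wiredtiger.h>

/* Insert one key/value pair, refusing to clobber an existing key. */
static int
insert_once(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	/* overwrite=false makes insert fail with WT_DUPLICATE_KEY. */
	if ((ret = session->open_cursor(
	    session, "table:example", NULL, "overwrite=false", &cursor)) != 0)
		return (ret);
	cursor->set_key(cursor, "key");
	cursor->set_value(cursor, "value");
	ret = cursor->insert(cursor);
	(void)cursor->close(cursor);
	return (ret);	/* WT_DUPLICATE_KEY if "key" already exists */
}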
/* * __wt_btcur_update -- * Update a record in the tree. */ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_FAST_CONN_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCR(session, cursor_update); WT_STAT_FAST_DATA_INCRV( session, cursor_update_bytes, cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = 0; __wt_btree_evictable(session, 1); } retry: WT_RET(__cursor_func_init(cbt, 1)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If not overwriting, fail if the key doesn't exist. If we * find an update for the key, check for conflicts. Update the * record if it exists. Creating a record past the end of the * tree in a fixed-length column-store implicitly fills the gap * with empty records. Update the record in that case, the * record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } ret = __cursor_col_modify(session, cbt, 0); break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, 1)); /* * If not overwriting, check for conflicts and fail if the key * does not exist. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, 0); break; WT_ILLEGAL_VALUE_ERR(session); } err: if (ret == WT_RESTART) { WT_STAT_FAST_CONN_INCR(session, cursor_restart); WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; } /* * If successful, point the cursor at internal copies of the data. We * could shuffle memory in the cursor so the key/value pair are in local * buffer memory, but that's a data copy. We don't want to do another * search (and we might get a different update structure if we race). * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
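The retry label is WiredTiger's standard answer to WT_RESTART: a page split invalidated the search, so the whole operation is re-run from the top. The generic shape of the pattern (an illustrative wrapper, not the internal helper):

#include <wiredtiger.h>

/*
 * Re-run an operation for as long as it reports the tree shifted
 * underneath it; any other result, success or failure, is final.
 */
static int
run_with_restart(int (*op)(void *), void *arg)
{
	int ret;

	do {
		ret = op(arg);
	} while (ret == WT_RESTART);
	return (ret);
}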
/* * __wt_btcur_search_near -- * Search for a record in the tree. */ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; exact = 0; WT_STAT_FAST_CONN_INCR(session, cursor_search_near); WT_STAT_FAST_DATA_INCR(session, cursor_search_near); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_func_init(cbt, 1)); /* * Set the "insert" flag for the btree row-store search; we may intend * to position our cursor at the end of the tree, rather than match an * existing record. */ WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, 1) : __cursor_col_search(session, cbt)); /* * If we find a valid key, return it. * * Else, creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. In this * case, we instantiate the empty record, it's an exact match. * * Else, move to the next key in the tree (bias for prefix searches). * Cursor next skips invalid rows, so we don't have to test for them * again. * * Else, redo the search and move to the previous key in the tree. * Cursor previous skips invalid rows, so we don't have to test for * them again. * * If that fails, quit, there's no record to return. */ if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND) exact = 1; else { WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, 1) : __cursor_col_search(session, cbt)); if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND) exact = -1; } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) *exactp = exact; return (ret); }
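Callers interpret the exact result as a three-way comparison against the search key. A usage sketch against the public API (string key format assumed):

#include <stdio.h>
#include <wiredtiger.h>

/* Position at the nearest match and report which side we landed on. */
static int
find_near(WT_CURSOR *cursor, const char *key)
{
	int exact, ret;

	cursor->set_key(cursor, key);
	if ((ret = cursor->search_near(cursor, &exact)) != 0)
		return (ret);		/* WT_NOTFOUND: no usable record */
	printf("landed %s the search key\n",
	    exact == 0 ? "on" : exact > 0 ? "after" : "before");
	return (0);
}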
/* * __wt_btcur_search -- * Search for a matching record in the tree. */ int __wt_btcur_search(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* -Wuninitialized */ WT_STAT_FAST_CONN_INCR(session, cursor_search); WT_STAT_FAST_DATA_INCR(session, cursor_search); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); /* * If we have a page pinned, search it; if we don't have a page pinned, * or the search of the pinned page doesn't find an exact match, search * from the root. */ valid = 0; if (F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { __wt_txn_cursor_op(session); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, 0) : __cursor_col_search(session, cbt, cbt->ref)); valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, 1)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, 0) : __cursor_col_search(session, cbt, NULL)); valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); } if (valid) ret = __wt_kv_return(session, cbt, upd); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. */ cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; } else ret = WT_NOTFOUND; err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict(session, ref, false); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. * * Possible optimization: if the page is already deleted and the delete * is visible to us (the delete has been committed), we could skip the * page instead of instantiating it and figuring out there are no rows * in the page. While that's a huge amount of work to no purpose, it's * unclear optimizing for overlapping range deletes is worth the effort. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that * out is to check the page's cell type, cells for leaf pages without * overflow items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. */ parent = ref->home; if (__wt_off_page(parent, ref->addr) ? ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; /* * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as * we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; WT_STAT_FAST_CONN_INCR(session, rec_page_delete_fast); WT_STAT_FAST_DATA_INCR(session, rec_page_delete_fast); WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* * Restore the page to on-disk status, we'll have to instantiate it. */ WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); }
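Both lock attempts above are a single compare-and-swap on the reference's state: take it only if the state is still the one we expect, and fail rather than wait. The same shape in C11 atomics (simplified states, not the WT_REF machinery):

#include <stdatomic.h>
#include <stdbool.h>

enum ref_state { REF_DISK, REF_LOCKED, REF_DELETED };

/*
 * Lock the reference only if it is still in the on-disk state; if any
 * other thread got there first, give up instead of waiting.
 */
static bool
try_lock_ref(_Atomic enum ref_state *state)
{
	enum ref_state expected = REF_DISK;

	return (atomic_compare_exchange_strong(state, &expected, REF_LOCKED));
}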
/*
 * __wt_block_write_off --
 *	Write a buffer into the block's file, returning the block's offset,
 *	size and checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
    int data_cksum, int caller_locked)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_RET;
	WT_FH *fh;
	size_t align_size;
	wt_off_t offset;
	int local_locked;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	fh = block->fh;
	local_locked = 0;

	/*
	 * The buffer must be allocated in aligned mode: this is a write to
	 * disk, and direct I/O requires aligned buffers.
	 */
	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
		WT_RET_MSG(session, EINVAL,
		    "direct I/O check: write buffer incorrectly allocated");
	}

	/*
	 * Align buf->size to the block allocation size. The aligned size may
	 * be larger than the current buf->memsize; if so, refuse to write,
	 * it would overrun the buffer.
	 */
	align_size = WT_ALIGN(buf->size, block->allocsize);
	if (align_size > buf->memsize) {
		WT_ASSERT(session, align_size <= buf->memsize);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer incorrectly allocated");
	}
	/* The aligned size must fit in 4GB (a uint32_t). */
	if (align_size > UINT32_MAX) {
		WT_ASSERT(session, align_size <= UINT32_MAX);
		WT_RET_MSG(session, EINVAL,
		    "buffer size check: write buffer too large to write");
	}

	/* Zero out the padding bytes added by the alignment. */
	memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

	/* Set up the block header and record the stored data length. */
	blk->disk_size = WT_STORE_SIZE(align_size);
	blk->flags = 0;
	if (data_cksum)
		F_SET(blk, WT_BLOCK_DATA_CKSUM);
	/*
	 * Compute the buffer's checksum: over the entire block when data
	 * checksums are on, otherwise only over the header prefix.
	 */
	blk->cksum = __wt_cksum(
	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);

	if (!caller_locked) {
		WT_RET(__wt_block_ext_prealloc(session, 5));
		__wt_spin_lock(session, &block->live_lock);
		local_locked = 1;
	}
	ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);

	/*
	 * Check whether the file needs to be extended; without extending it,
	 * the new block's data might not fit.
	 */
	if (ret == 0 && fh->extend_len != 0 &&
	    (fh->extend_size <= fh->size ||
	    (offset + fh->extend_len <= fh->extend_size &&
	    offset + fh->extend_len +
	    (wt_off_t)align_size >= fh->extend_size))) {
		/* Advance extend_size to offset plus twice extend_len. */
		fh->extend_size = offset + fh->extend_len * 2;
		if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
			/*
			 * Drop the block->live_lock spinlock first: resizing
			 * the file can take a while and we don't want other
			 * CPUs spinning on the lock in the meantime.
			 */
			if (!fh->fallocate_requires_locking && local_locked) {
				__wt_spin_unlock(session, &block->live_lock);
				local_locked = 0;
			}
			/* Extend the file's allocated space. */
			if ((ret = __wt_fallocate(session, fh,
			    offset, fh->extend_len * 2)) == ENOTSUP) {
				ret = 0;
				goto extend_truncate;
			}
		} else {
extend_truncate:	if (!caller_locked && local_locked == 0) {
				__wt_spin_lock(session, &block->live_lock);
				local_locked = 1;
			}
			/*
			 * Resize the file directly; this is slower than
			 * __wt_fallocate.
			 */
			if ((ret = __wt_ftruncate(session, fh,
			    offset + fh->extend_len * 2)) == EBUSY)
				ret = 0;
		}
	}
	if (local_locked) {
		__wt_spin_unlock(session, &block->live_lock);
		local_locked = 0;
	}
	WT_RET(ret);

	/* Write the block's data. */
	ret = __wt_write(session, fh, offset, align_size, buf->mem);
	if (ret != 0) {
		if (!caller_locked)
			__wt_spin_lock(session, &block->live_lock);
		/* The write failed: return the extent to the avail list. */
		WT_TRET(__wt_block_off_free(
		    session, block, offset, (wt_off_t)align_size));
		if (!caller_locked)
			__wt_spin_unlock(session, &block->live_lock);
		WT_RET(ret);
	}

#ifdef HAVE_SYNC_FILE_RANGE
	/*
	 * Too many dirty pages in the OS cache: schedule an asynchronous
	 * flush.
	 */
	if (block->os_cache_dirty_max != 0 &&
	    (block->os_cache_dirty += align_size) > block->os_cache_dirty_max &&
	    __wt_session_can_wait(session)) {
		block->os_cache_dirty = 0;
		WT_RET(__wt_fsync_async(session, fh));
	}
#endif
#ifdef HAVE_POSIX_FADVISE
	/*
	 * Evict this file's data from the system page cache; this may itself
	 * do I/O, making it comparable to a synchronous flush.
	 */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += align_size) > block->os_cache_max) {
		block->os_cache = 0;
		if ((ret = posix_fadvise(fh->fd,
		    (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	WT_STAT_FAST_CONN_INCR(session, block_write);
	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);

	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
	    (uintmax_t)offset, (uintmax_t)align_size, blk->cksum));

	*offsetp = offset;
	*sizep = WT_STORE_SIZE(align_size);
	*cksump = blk->cksum;

	return (ret);
}
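The size and padding bookkeeping reduces to a little arithmetic: round the payload up to the allocation size, refuse if the padded size would overflow the buffer or a uint32_t, and zero the padding so stale memory never reaches disk. A self-contained sketch (hypothetical names, not the WiredTiger macros):

#include <stdint.h>
#include <string.h>

/* Round n up to the next multiple of a. */
#define ALIGN_UP(n, a) ((((n) + (a) - 1) / (a)) * (a))

static int
pad_for_write(uint8_t *buf, size_t used, size_t capacity, size_t allocsize)
{
	size_t align_size = ALIGN_UP(used, allocsize);

	if (align_size > capacity || align_size > UINT32_MAX)
		return (-1);		/* padded size doesn't fit */
	memset(buf + used, 0, align_size - used);	/* no stale bytes */
	return (0);
}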
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(etmp); WT_DECL_ITEM(tmp); WT_DECL_RET; WT_ENCRYPTOR *encryptor; WT_ITEM *ip; const WT_PAGE_HEADER *dsk; const char *fail_msg; size_t result_len; btree = S2BT(session); bm = btree->bm; fail_msg = NULL; /* -Wuninitialized */ /* * If anticipating a compressed or encrypted block, read into a scratch * buffer and decompress into the caller's buffer. Else, read directly * into the caller's buffer. */ if (btree->compressor == NULL && btree->kencryptor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; ip = NULL; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; ip = tmp; } /* * If the block is encrypted, copy the skipped bytes of the original * image into place, then decrypt. */ if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) { if (btree->kencryptor == NULL || (encryptor = btree->kencryptor->encryptor) == NULL || encryptor->decrypt == NULL) { fail_msg = "encrypted block in file for which no encryption " "configured"; goto corrupt; } WT_ERR(__wt_scr_alloc(session, 0, &etmp)); if ((ret = __wt_decrypt(session, encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) { fail_msg = "block decryption failed"; goto corrupt; } ip = etmp; dsk = ip->data; } else if (btree->kencryptor != NULL) { fail_msg = "unencrypted block in file for which encryption configured"; goto corrupt; } if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) { fail_msg = "compressed block in file for which no compression " "configured"; goto corrupt; } /* * Size the buffer based on the in-memory bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP); ret = btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) { fail_msg = "block decompression failed"; goto corrupt; } } else /* * If we uncompressed above, the page is in the correct buffer. * If we get here the data may be in the wrong buffer and the * buffer may be the wrong size. If needed, get the page * into the destination buffer. */ if (ip != NULL) WT_ERR(__wt_buf_set( session, buf, ip->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. 
*/ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); if (0) { corrupt: if (ret == 0) ret = WT_ERROR; if (!F_ISSET(btree, WT_BTREE_VERIFY) && !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) { __wt_err(session, ret, "%s", fail_msg); ret = __wt_illegal_value(session, btree->dhandle->name); } } err: __wt_scr_free(session, &tmp); __wt_scr_free(session, &etmp); return (ret); }
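Two details above deserve emphasis: the transforms are undone in the reverse of write order (decrypt, then decompress), and when checksums were traded away in favor of letting decompression catch corruption, the decompressed-length comparison is that detector. A minimal sketch of the check:

#include <stddef.h>

/*
 * A decompressed block must produce exactly the in-memory size recorded
 * in the page header (minus the skipped, uncompressed prefix); anything
 * else is treated as corruption.
 */
static int
check_decompressed_len(size_t result_len, size_t mem_size, size_t skip)
{
	return (result_len == mem_size - skip ? 0 : -1);
}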
/* * __wt_cond_wait_signal -- * Wait on a mutex, optionally timing out. If we get it * before the time out period expires, let the caller know. */ int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) { DWORD err, milliseconds; WT_DECL_RET; uint64_t milliseconds64; bool locked; locked = false; /* Fast path if already signalled. */ *signalled = true; if (__wt_atomic_addi32(&cond->waiters, 1) == 0) return (0); /* * !!! * This function MUST handle a NULL session handle. */ if (session != NULL) { WT_RET(__wt_verbose(session, WT_VERB_MUTEX, "wait %s cond (%p)", cond->name, cond)); WT_STAT_FAST_CONN_INCR(session, cond_wait); } EnterCriticalSection(&cond->mtx); locked = true; if (usecs > 0) { milliseconds64 = usecs / 1000; /* * Check for 32-bit unsigned integer overflow * INFINITE is max unsigned int on Windows */ if (milliseconds64 >= INFINITE) milliseconds64 = INFINITE - 1; milliseconds = (DWORD)milliseconds64; /* * 0 would mean the CV sleep becomes a TryCV which we do not * want */ if (milliseconds == 0) milliseconds = 1; ret = SleepConditionVariableCS( &cond->cond, &cond->mtx, milliseconds); } else ret = SleepConditionVariableCS( &cond->cond, &cond->mtx, INFINITE); /* * SleepConditionVariableCS returns non-zero on success, 0 on timeout * or failure. Check for timeout, else convert to a WiredTiger error * value and fail. */ if (ret == 0) { if ((err = GetLastError()) == ERROR_TIMEOUT) *signalled = false; else ret = __wt_errno(); } else ret = 0; (void)__wt_atomic_subi32(&cond->waiters, 1); if (locked) LeaveCriticalSection(&cond->mtx); if (ret == 0) return (0); WT_RET_MSG(session, ret, "SleepConditionVariableCS"); }
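The timeout conversion has two trap values to dodge: INFINITE (wait forever) and 0 (the wait would return immediately instead of sleeping). A sketch of the clamping in portable C (WIN_INFINITE stands in for the Windows constant):

#include <stdint.h>

#define WIN_INFINITE 0xFFFFFFFFu	/* Windows INFINITE */

static uint32_t
usecs_to_ms_clamped(uint64_t usecs)
{
	uint64_t ms = usecs / 1000;

	if (ms >= WIN_INFINITE)
		ms = WIN_INFINITE - 1;	/* stay finite */
	if (ms == 0)
		ms = 1;			/* 0 would poll, not wait */
	return ((uint32_t)ms);
}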
/* * __wt_open -- * Open a file handle. */ int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *fh, *tfh; mode_t mode; int direct_io, f, fd, matched; const char *path; conn = S2C(session); fh = NULL; fd = -1; path = NULL; WT_VERBOSE_RET(session, fileops, "%s: open", name); /* Increment the reference count if we already have the file open. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); TAILQ_FOREACH(tfh, &conn->fhqh, q) if (strcmp(name, tfh->name) == 0) { ++tfh->refcnt; *fhp = tfh; matched = 1; break; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) return (0); WT_RET(__wt_filename(session, name, &path)); f = O_RDWR; #ifdef O_BINARY /* Windows clones: we always want to treat the file as a binary. */ f |= O_BINARY; #endif #ifdef O_CLOEXEC /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. */ f |= O_CLOEXEC; #endif #ifdef O_NOATIME /* Avoid updating metadata for read-only workloads. */ if (dio_type == WT_FILE_TYPE_DATA) f |= O_NOATIME; #endif if (ok_create) { f |= O_CREAT; if (exclusive) f |= O_EXCL; mode = 0666; } else mode = 0; direct_io = 0; #ifdef O_DIRECT if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) { f |= O_DIRECT; direct_io = 1; } #endif if (dio_type == WT_FILE_TYPE_LOG && FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) #ifdef O_DSYNC f |= O_DSYNC; #elif defined(O_SYNC) f |= O_SYNC; #else WT_ERR_MSG(session, ENOTSUP, "Unsupported log sync mode requested"); #endif WT_SYSCALL_RETRY(((fd = open(path, f, mode)) == -1 ? 1 : 0), ret); if (ret != 0) WT_ERR_MSG(session, ret, direct_io ? "%s: open failed with direct I/O configured, some " "filesystem types do not support direct I/O" : "%s", path); #if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC) /* * Security: * The application may spawn a new process, and we don't want another * process to have access to our file handles. There's an obvious * race here, so we prefer the flag to open if available. */ if ((f = fcntl(fd, F_GETFD)) == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) WT_ERR_MSG(session, __wt_errno(), "%s: fcntl", name); #endif #if defined(HAVE_POSIX_FADVISE) /* Disable read-ahead on trees: it slows down random read workloads. */ if (dio_type == WT_FILE_TYPE_DATA) WT_ERR(posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM)); #endif if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_ERR(__open_directory_sync(session)); WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh)); WT_ERR(__wt_strdup(session, name, &fh->name)); fh->fd = fd; fh->refcnt = 1; fh->direct_io = direct_io; /* Set the file's size. */ WT_ERR(__wt_filesize(session, fh, &fh->size)); /* Configure file extension. */ if (dio_type == WT_FILE_TYPE_DATA) fh->extend_len = conn->data_extend_len; /* * Repeat the check for a match, but then link onto the database's list * of files. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); TAILQ_FOREACH(tfh, &conn->fhqh, q) if (strcmp(name, tfh->name) == 0) { ++tfh->refcnt; *fhp = tfh; matched = 1; break; } if (!matched) { TAILQ_INSERT_TAIL(&conn->fhqh, fh, q); WT_STAT_FAST_CONN_INCR(session, file_open); *fhp = fh; } __wt_spin_unlock(session, &conn->fh_lock); if (matched) { err: if (fh != NULL) { __wt_free(session, fh->name); __wt_free(session, fh); } if (fd != -1) (void)close(fd); } __wt_free(session, path); return (ret); }
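The fcntl() fallback deserves a note: between open() and fcntl() another thread could fork and exec, leaking the descriptor, which is why the O_CLOEXEC open flag is preferred when it exists. The fallback in isolation:

#include <fcntl.h>

/* Mark an already-open descriptor close-on-exec. */
static int
set_cloexec(int fd)
{
	int flags;

	if ((flags = fcntl(fd, F_GETFD)) == -1)
		return (-1);
	return (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1 ? -1 : 0);
}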
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(tmp); WT_DECL_RET; const WT_PAGE_HEADER *dsk; size_t result_len; btree = S2BT(session); bm = btree->bm; /* * If anticipating a compressed block, read into a scratch buffer and * decompress into the caller's buffer. Else, read directly into the * caller's buffer. */ if (btree->compressor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; } /* * If the block is compressed, copy the skipped bytes of the original * image into place, then decompress. */ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) WT_ERR_MSG(session, WT_ERROR, "read compressed block where no compression engine " "configured"); /* * We're allocating the exact number of bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP); WT_ERR(btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len)); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) WT_ERR( F_ISSET(btree, WT_BTREE_VERIFY) || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? WT_ERROR : __wt_illegal_value(session, btree->dhandle->name)); } else if (btree->compressor == NULL) buf->size = dsk->mem_size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. * This should be rare, but happens with small blocks * that aren't worth compressing. */ WT_ERR(__wt_buf_set( session, buf, tmp->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. */ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); err: __wt_scr_free(&tmp); return (ret); }
/* * __wt_log_wrlsn -- * Process written log slots and attempt to coalesce them if the LSNs * are contiguous. Returns 1 if slots were freed, 0 if no slots were * freed in the progress arg. Must be called with the log slot lock held. */ int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) { WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *coalescing, *slot; size_t written_i; uint32_t i, save_i; conn = S2C(session); log = conn->log; coalescing = NULL; written_i = 0; i = 0; if (free_i != NULL) *free_i = WT_SLOT_POOL; /* * Walk the array once saving any slots that are in the * WT_LOG_SLOT_WRITTEN state. */ while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; if (free_i != NULL && *free_i == WT_SLOT_POOL && slot->slot_state == WT_LOG_SLOT_FREE) *free_i = save_i; if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; written[written_i++].lsn = slot->slot_release_lsn; } /* * If we found any written slots process them. We sort them * based on the release LSN, and then look for them in order. */ if (written_i > 0) { /* * If wanted, reset the yield variable to indicate that we * have found written slots. */ if (yield != NULL) *yield = 0; WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); /* * We know the written array is sorted by LSN. Go * through them either advancing write_lsn or coalesce * contiguous ranges of written slots. */ for (i = 0; i < written_i; i++) { slot = &log->slot_pool[written[i].slot_index]; if (coalescing != NULL) { if (WT_LOG_CMP(&coalescing->slot_end_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; } /* * If we get here we have a slot to coalesce * and free. */ coalescing->slot_end_lsn = slot->slot_end_lsn; WT_STAT_FAST_CONN_INCR( session, log_slot_coalesced); /* * Copy the flag for later closing. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) F_SET(coalescing, WT_SLOT_CLOSEFH); } else { /* * If this written slot is not the next LSN, * try to start coalescing with later slots. */ if (WT_LOG_CMP( &log->write_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; } /* * If we get here we have a slot to process. * Advance the LSN and process the slot. */ WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn, &slot->slot_release_lsn) == 0); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; WT_RET(__wt_cond_signal( session, log->log_write_cond)); WT_STAT_FAST_CONN_INCR(session, log_write_lsn); /* * Signal the close thread if needed. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) WT_RET(__wt_cond_signal( session, conn->log_file_cond)); } WT_RET(__wt_log_slot_free(session, slot)); if (free_i != NULL && *free_i == WT_SLOT_POOL && slot->slot_state == WT_LOG_SLOT_FREE) *free_i = save_i; } } return (0); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } page = ref->page; WT_ASSERT(session, page != NULL); /* Forcibly evict pages that are too big. */ if (!LF_ISSET(WT_READ_NO_EVICT) && force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; if ((ret = __wt_page_release_busy( session, ref, flags)) == EBUSY) { /* If forced eviction fails, stall. */ ret = 0; wait_cnt += 1000; } else WT_RET(ret); WT_STAT_FAST_CONN_INCR( session, page_forcible_evict_blocked); break; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
/* * __wt_log_slot_join -- * Join a consolidated logging slot. Must be called with * the read lock held. */ void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join; #ifdef HAVE_DIAGNOSTIC bool unbuf_force; #endif conn = S2C(session); log = conn->log; /* * Make sure the length cannot overflow. The caller should not * even call this function if it doesn't fit but use direct * writes. */ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); /* * There should almost always be a slot open. */ #ifdef HAVE_DIAGNOSTIC unbuf_force = (++log->write_calls % 1000) == 0; #endif for (;;) { WT_BARRIER(); slot = log->active_slot; old_state = slot->slot_state; /* * Try to join our size into the existing size and * atomically write it back into the state. */ flag_state = WT_LOG_SLOT_FLAGS(old_state); released = WT_LOG_SLOT_RELEASED(old_state); join_offset = WT_LOG_SLOT_JOINED(old_state); #ifdef HAVE_DIAGNOSTIC if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { #else if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; F_SET(myslot, WT_MYSLOT_UNBUFFERED); myslot->slot = slot; } else new_join = join_offset + (int32_t)mysize; new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( (int64_t)new_join, (int64_t)released, (int64_t)flag_state); /* * Check if the slot is open for joining and we are able to * swap in our size into the state. */ if (WT_LOG_SLOT_OPEN(old_state) && __wt_atomic_casiv64( &slot->slot_state, old_state, new_state)) break; /* * The slot is no longer open or we lost the race to * update it. Yield and try again. */ WT_STAT_FAST_CONN_INCR(session, log_slot_races); __wt_yield(); } /* * We joined this slot. Fill in our information to return to * the caller. */ if (mysize != 0) WT_STAT_FAST_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) F_SET(slot, WT_SLOT_FLUSH); if (LF_ISSET(WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC); if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) { WT_ASSERT(session, slot->slot_unbuffered == 0); WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered); slot->slot_unbuffered = (int64_t)mysize; } myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); } /* * __wt_log_slot_release -- * Each thread in a consolidated group releases its portion to * signal it has completed copying its piece of the log into * the memory buffer. */ int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) { WT_LOGSLOT *slot; wt_off_t cur_offset, my_start; int64_t my_size, rel_size; WT_UNUSED(session); slot = myslot->slot; my_start = slot->slot_start_offset + myslot->offset; while ((cur_offset = slot->slot_last_offset) < my_start) { /* * Set our offset if we are larger. */ if (__wt_atomic_casiv64( &slot->slot_last_offset, cur_offset, my_start)) break; /* * If we raced another thread updating this, try again. */ WT_BARRIER(); } /* * Add my size into the state and return the new size. */ rel_size = size; if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) rel_size = WT_LOG_SLOT_UNBUFFERED; my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0); return (__wt_atomic_addiv64(&slot->slot_state, my_size)); }
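The slot state packs several counters into one 64-bit word so a single compare-and-swap can admit a joiner or record a release. A simplified sketch of the join step using a hypothetical 32/32 split (the real WT_LOG_SLOT_* layout also carries flag bits and an unbuffered marker):

#include <stdatomic.h>
#include <stdint.h>

#define JOINED(state)	((uint32_t)((state) >> 32))
#define RELEASED(state)	((uint32_t)(state))
#define MAKE_STATE(j, r) (((uint64_t)(j) << 32) | (uint64_t)(r))

/*
 * Reserve "size" bytes in the slot by bumping the joined count with a
 * compare-and-swap; returns the write offset, or UINT32_MAX if we lost
 * the race and the caller should yield and retry.
 */
static uint32_t
slot_join(_Atomic uint64_t *state, uint32_t size)
{
	uint64_t old_state = atomic_load(state);
	uint64_t new_state =
	    MAKE_STATE(JOINED(old_state) + size, RELEASED(old_state));

	return (atomic_compare_exchange_strong(state, &old_state, new_state) ?
	    JOINED(old_state) : UINT32_MAX);
}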
/* * __wt_btcur_search_near -- * Search for a record in the tree. */ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact, valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* -Wuninitialized */ exact = 0; WT_STAT_FAST_CONN_INCR(session, cursor_search_near); WT_STAT_FAST_DATA_INCR(session, cursor_search_near); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact * match, search from the root. Unlike WT_CURSOR.search, ignore pinned * pages in the case of column-store, search-near isn't an interesting * enough case for column-store to add the complexity needed to avoid * the tree search. * * Set the "insert" flag for the btree row-store search; we may intend * to position the cursor at the end of the tree, rather than match an * existing record. */ valid = 0; if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { __wt_txn_cursor_op(session); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, 1)); /* * Search-near is trickier than search when searching an already * pinned page. If search returns the first or last page slots, * discard the results and search the full tree as the neighbor * pages might offer better matches. This test is simplistic as * we're ignoring append lists (there may be no page slots or we * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->pg_row_entries - 1) valid = __cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, 1)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, 1) : __cursor_col_search(session, cbt, NULL)); valid = __cursor_valid(cbt, &upd); } /* * If we find a valid key, return it. * * Else, creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. In this * case, we instantiate the empty record, it's an exact match. * * Else, move to the next key in the tree (bias for prefix searches). * Cursor next skips invalid rows, so we don't have to test for them * again. * * Else, redo the search and move to the previous key in the tree. * Cursor previous skips invalid rows, so we don't have to test for * them again. * * If that fails, quit, there's no record to return. */ if (valid) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; } else if ((ret = __wt_btcur_next(cbt, 0)) != WT_NOTFOUND) exact = 1; else { WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, 1) : __cursor_col_search(session, cbt, NULL)); if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, 0)) != WT_NOTFOUND) exact = -1; } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) *exactp = exact; return (ret); }
/* * __log_slot_close -- * Close out the slot the caller is using. The slot may already be * closed or freed by another thread. */ static int __log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *releasep, bool forced) { WT_CONNECTION_IMPL *conn; WT_LOG *log; int64_t end_offset, new_state, old_state; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, releasep != NULL); conn = S2C(session); log = conn->log; *releasep = 0; if (slot == NULL) return (WT_NOTFOUND); retry: old_state = slot->slot_state; /* * If this close is coming from a forced close and a thread is in * the middle of using the slot, return EBUSY. The caller can * decide if retrying is necessary or not. */ if (forced && WT_LOG_SLOT_INPROGRESS(old_state)) return (EBUSY); /* * If someone else is switching out this slot we lost. Nothing to * do but return. Return WT_NOTFOUND anytime the given slot was * processed by another closing thread. Only return 0 when we * actually closed the slot. */ if (WT_LOG_SLOT_CLOSED(old_state)) return (WT_NOTFOUND); /* * If someone completely processed this slot, we're done. */ if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) return (WT_NOTFOUND); new_state = (old_state | WT_LOG_SLOT_CLOSE); /* * Close this slot. If we lose the race retry. */ if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state)) goto retry; /* * We own the slot now. No one else can join. * Set the end LSN. */ WT_STAT_FAST_CONN_INCR(session, log_slot_closes); if (WT_LOG_SLOT_DONE(new_state)) *releasep = 1; slot->slot_end_lsn = slot->slot_start_lsn; end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; slot->slot_end_lsn.offset += (wt_off_t)end_offset; WT_STAT_FAST_CONN_INCRV(session, log_slot_consolidated, end_offset); /* * XXX Would like to change so one piece of code advances the LSN. */ log->alloc_lsn = slot->slot_end_lsn; WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file); return (0); }
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    btree = cbt->btree;
    cursor = &cbt->iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    WT_STAT_FAST_CONN_INCR(session, cursor_remove);
    WT_STAT_FAST_DATA_INCR(session, cursor_remove);
    WT_STAT_FAST_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);

    if (btree->type == BTREE_ROW)
        WT_RET(__cursor_size_chk(session, &cursor->key));

retry:
    WT_RET(__cursor_func_init(cbt, 1));

    switch (btree->type) {
    case BTREE_COL_FIX:
    case BTREE_COL_VAR:
        WT_ERR(__cursor_col_search(session, cbt, NULL));

        /*
         * If we find a matching record, check whether an update would
         * conflict. Do this before checking if the update is visible
         * in __cursor_valid, or we can miss a conflict.
         */
        WT_ERR(__curfile_update_check(cbt));

        /* Remove the record if it exists. */
        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) {
            if (!__cursor_fix_implicit(btree, cbt))
                WT_ERR(WT_NOTFOUND);
            /*
             * Creating a record past the end of the tree in a
             * fixed-length column-store implicitly fills the gap
             * with empty records. Return success in that case, the
             * record was deleted successfully.
             *
             * Correct the btree cursor's location: the search will
             * have pointed us at the previous/next item, and that's
             * not correct.
             */
            cbt->recno = cursor->recno;
        } else
            ret = __cursor_col_modify(session, cbt, 1);
        break;
    case BTREE_ROW:
        /* Remove the record if it exists. */
        WT_ERR(__cursor_row_search(session, cbt, NULL, 0));

        /* Check whether an update would conflict. */
        WT_ERR(__curfile_update_check(cbt));

        if (cbt->compare != 0 || !__cursor_valid(cbt, NULL))
            WT_ERR(WT_NOTFOUND);

        ret = __cursor_row_modify(session, cbt, 1);
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

err:
    if (ret == WT_RESTART) {
        WT_STAT_FAST_CONN_INCR(session, cursor_restart);
        WT_STAT_FAST_DATA_INCR(session, cursor_restart);
        goto retry;
    }
    /*
     * If the cursor is configured to overwrite and the record is not
     * found, that is exactly what we want.
     */
    if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND)
        ret = 0;

    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));

    return (ret);
}
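/*
 * A usage sketch, not WiredTiger source: with the default "overwrite=true"
 * cursor configuration, removing a nonexistent record succeeds, matching
 * the WT_CURSTD_OVERWRITE mapping above; opening the cursor with
 * "overwrite=false" surfaces WT_NOTFOUND instead. The table URI, string
 * key format and key value are assumptions for illustration.
 */
static int
example_remove(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    int ret;

    if ((ret = session->open_cursor(
        session, "table:example", NULL, "overwrite=false", &cursor)) != 0)
        return (ret);
    cursor->set_key(cursor, "no-such-key");
    ret = cursor->remove(cursor);       /* Expect WT_NOTFOUND here. */
    (void)cursor->close(cursor);
    return (ret);
}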
/*
 * __wt_cond_wait_signal --
 *	Wait on a mutex, optionally timing out. If we get it before the
 *	timeout period expires, let the caller know.
 */
int
__wt_cond_wait_signal(
    WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs,
    bool *signalled)
{
    BOOL sleepret;
    DWORD milliseconds, windows_error;
    bool locked;
    uint64_t milliseconds64;

    locked = false;

    /* Fast path if already signalled. */
    *signalled = true;
    if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
        return (0);

    /*
     * !!!
     * This function MUST handle a NULL session handle.
     */
    if (session != NULL) {
        WT_RET(__wt_verbose(session, WT_VERB_MUTEX,
            "wait %s cond (%p)", cond->name, cond));
        WT_STAT_FAST_CONN_INCR(session, cond_wait);
    }

    EnterCriticalSection(&cond->mtx);
    locked = true;

    if (usecs > 0) {
        milliseconds64 = usecs / 1000;

        /*
         * Check for 32-bit unsigned integer overflow: INFINITE is the
         * maximum unsigned int on Windows.
         */
        if (milliseconds64 >= INFINITE)
            milliseconds64 = INFINITE - 1;
        milliseconds = (DWORD)milliseconds64;

        /*
         * A timeout of 0 would turn the condition-variable sleep into
         * a try, which we do not want; wait at least one millisecond.
         */
        if (milliseconds == 0)
            milliseconds = 1;

        sleepret = SleepConditionVariableCS(
            &cond->cond, &cond->mtx, milliseconds);
    } else
        sleepret = SleepConditionVariableCS(
            &cond->cond, &cond->mtx, INFINITE);

    /*
     * SleepConditionVariableCS returns non-zero on success, 0 on timeout
     * or failure.
     */
    if (sleepret == 0) {
        windows_error = __wt_getlasterror();
        if (windows_error == ERROR_TIMEOUT) {
            *signalled = false;
            sleepret = 1;
        }
    }

    (void)__wt_atomic_subi32(&cond->waiters, 1);

    if (locked)
        LeaveCriticalSection(&cond->mtx);

    if (sleepret != 0)
        return (0);

    __wt_errx(session, "SleepConditionVariableCS: %s",
        __wt_formatmessage(session, windows_error));
    return (__wt_map_windows_error(windows_error));
}
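/*
 * A stripped-down sketch of the same timeout-versus-signal distinction
 * using the raw Win32 primitives; illustrative only, with none of the
 * waiter accounting or verbose tracing above. The function name is an
 * assumption; the Win32 calls are the standard ones.
 */
#include <windows.h>
#include <stdbool.h>

static int
example_timed_wait(CONDITION_VARIABLE *cond,
    CRITICAL_SECTION *mtx, DWORD milliseconds, bool *signalled)
{
    DWORD windows_error;

    *signalled = true;
    EnterCriticalSection(mtx);
    if (SleepConditionVariableCS(cond, mtx, milliseconds) == 0) {
        windows_error = GetLastError();
        LeaveCriticalSection(mtx);
        if (windows_error == ERROR_TIMEOUT) {
            *signalled = false;         /* Timed out, not signalled. */
            return (0);
        }
        return ((int)windows_error);    /* Real failure. */
    }
    LeaveCriticalSection(mtx);
    return (0);
}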
/*
 * __page_read --
 *	Read a page from the file.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    const WT_PAGE_HEADER *dsk;
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    size_t addr_size;
    uint32_t previous_state;
    const uint8_t *addr;

    btree = S2BT(session);
    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read
     * function, force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED, for deleted pages. If successful, we've won the
     * race, read the page.
     */
    if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (__wt_atomic_casv32(
        &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but a
     * subsequent search or insert is forcing re-creation of the name
     * space.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);

        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
        goto done;
    }

    /*
     * There's an address, read or map the backing disk page and build an
     * in-memory version of the page.
     */
    WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
    WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
        WT_DATA_IN_ITEM(&tmp) ?
        WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

    /*
     * Clear the local reference to an allocated copy of the disk image on
     * return; the page steals it, errors in this code should not free it.
     */
    tmp.mem = NULL;

    /*
     * If reading for a checkpoint, there's no additional work to do, the
     * page on disk is correct as written.
     */
    if (session->dhandle->checkpoint != NULL)
        goto done;

    /* If the page was deleted, instantiate that information. */
    if (previous_state == WT_REF_DELETED)
        WT_ERR(__wt_delete_page_instantiate(session, ref));

    /*
     * Instantiate updates from the database's lookaside table. The page
     * flag was set when the page was written, potentially a long time ago.
     * We only care if the lookaside table is currently active, check that
     * before doing any work.
     */
    dsk = tmp.data;
    if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
        WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
        WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

        WT_ERR(__las_page_instantiate(
            session, ref, btree->id, addr, addr_size));
    }

done:
    WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:
    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image. Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
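/*
 * The read path above hinges on a pair of compare-and-swap attempts that
 * elect exactly one reader per page. The sketch below isolates that
 * pattern with C11 atomics; the state values and names are illustrative
 * assumptions, not the WT_REF states.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { EX_DISK, EX_DELETED, EX_READING, EX_LOCKED, EX_MEM };

static bool
example_begin_read(_Atomic uint32_t *state, uint32_t *previous_statep)
{
    uint32_t expected;

    /* Normal read: DISK -> READING. */
    expected = EX_DISK;
    if (atomic_compare_exchange_strong(state, &expected, EX_READING)) {
        *previous_statep = EX_DISK;
        return (true);
    }
    /* Deleted page: DELETED -> LOCKED. */
    expected = EX_DELETED;
    if (atomic_compare_exchange_strong(state, &expected, EX_LOCKED)) {
        *previous_statep = EX_DELETED;
        return (true);
    }
    return (false);             /* Another thread won the race. */
}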
/*
 * __wt_btcur_next --
 *	Move to the next record in the tree.
 */
int
__wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating)
{
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint32_t flags;
    int newpage;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    WT_STAT_FAST_CONN_INCR(session, cursor_next);
    WT_STAT_FAST_DATA_INCR(session, cursor_next);

    flags = WT_READ_SKIP_INTL;              /* Tree walk flags. */
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    WT_RET(__cursor_func_init(cbt, 0));

    /*
     * If we aren't already iterating in the right direction, there's
     * some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT))
        __wt_btcur_iterate_setup(cbt, 1);

    /*
     * Walk any page we're holding until the underlying call returns
     * not-found. Then, move to the next page, until we reach the end of
     * the file.
     */
    for (newpage = 0;; newpage = 1) {
        page = cbt->ref == NULL ? NULL : cbt->ref->page;
        WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_append_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret == 0)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
        } else if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_next(cbt, newpage);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_next(cbt, newpage);
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_next(cbt, newpage);
                break;
            WT_ILLEGAL_VALUE_ERR(session);
            }
            if (ret != WT_NOTFOUND)
                break;

            /*
             * The last page in a column-store has appended entries.
             * We handle it separately from the usual cursor code:
             * it's only that one page and it's in a simple format.
             */
            if (page->type != WT_PAGE_ROW_LEAF &&
                (cbt->ins_head = WT_COL_APPEND(page)) != NULL) {
                F_SET(cbt, WT_CBT_ITERATE_APPEND);
                continue;
            }
        }

        /*
         * If we saw a lot of deleted records on this page, or we went
         * all the way through a page and only saw deleted records, try
         * to evict the page when we release it. Otherwise repeatedly
         * deleting from the beginning of a tree can have quadratic
         * performance. Take care not to force eviction of pages that
         * are genuinely empty, in new trees.
         */
        if (page != NULL &&
            (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
            (newpage && cbt->page_deleted_count > 0)))
            __wt_page_evict_soon(page);
        cbt->page_deleted_count = 0;

        WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
    }

err:
    if (ret != 0)
        WT_TRET(__cursor_reset(cbt));
    return (ret);
}
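/*
 * A usage sketch, not WiredTiger source, of the forward scan this function
 * backs: the public WT_CURSOR.next call returns 0 per record and
 * WT_NOTFOUND at the end of the table. String key and value formats are
 * assumed for illustration.
 */
static int
example_scan(WT_CURSOR *cursor)
{
    const char *key, *value;
    int ret;

    while ((ret = cursor->next(cursor)) == 0) {
        if ((ret = cursor->get_key(cursor, &key)) != 0 ||
            (ret = cursor->get_value(cursor, &value)) != 0)
            return (ret);
        /* Process the record here. */
    }
    return (ret == WT_NOTFOUND ? 0 : ret);
}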
/*
 * __wt_curfile_create --
 *	Open a cursor for a given btree handle.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
    WT_CURSOR **cursorp)
{
    WT_CURSOR_STATIC_INIT(iface,
        __wt_cursor_get_key,            /* get-key */
        __wt_cursor_get_value,          /* get-value */
        __wt_cursor_set_key,            /* set-key */
        __wt_cursor_set_value,          /* set-value */
        __curfile_compare,              /* compare */
        __curfile_equals,               /* equals */
        __curfile_next,                 /* next */
        __curfile_prev,                 /* prev */
        __curfile_reset,                /* reset */
        __curfile_search,               /* search */
        __curfile_search_near,          /* search-near */
        __curfile_insert,               /* insert */
        __curfile_update,               /* update */
        __curfile_remove,               /* remove */
        __wt_cursor_reconfigure,        /* reconfigure */
        __curfile_close);               /* close */
    WT_BTREE *btree;
    WT_CONFIG_ITEM cval;
    WT_CURSOR *cursor;
    WT_CURSOR_BTREE *cbt;
    WT_CURSOR_BULK *cbulk;
    WT_DECL_RET;
    size_t csize;

    WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

    cbt = NULL;

    btree = S2BT(session);
    WT_ASSERT(session, btree != NULL);

    csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE);
    WT_RET(__wt_calloc(session, 1, csize, &cbt));

    cursor = &cbt->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    cursor->internal_uri = btree->dhandle->name;
    cursor->key_format = btree->key_format;
    cursor->value_format = btree->value_format;
    cbt->btree = btree;

    if (bulk) {
        F_SET(cursor, WT_CURSTD_BULK);

        cbulk = (WT_CURSOR_BULK *)cbt;

        /* Optionally skip the validation of each bulk-loaded key. */
        WT_ERR(__wt_config_gets_def(
            session, cfg, "skip_sort_check", 0, &cval));
        WT_ERR(__wt_curbulk_init(
            session, cbulk, bitmap, cval.val == 0 ? 0 : 1));
    }

    /*
     * random_retrieval
     *	Random retrieval cursors only support next, reset and close.
     */
    WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
    if (cval.val != 0) {
        __wt_cursor_set_notsup(cursor);
        cursor->next = __curfile_next_random;
        cursor->reset = __curfile_reset;
    }

    /* __wt_cursor_init is last so we don't have to clean up on error. */
    WT_ERR(__wt_cursor_init(
        cursor, cursor->internal_uri, owner, cfg, cursorp));

    WT_STAT_FAST_CONN_INCR(session, cursor_create);
    WT_STAT_FAST_DATA_INCR(session, cursor_create);

    if (0) {
err:        __wt_free(session, cbt);
    }

    return (ret);
}
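/*
 * A usage sketch, not WiredTiger source, of the bulk path above: a
 * bulk-load cursor is requested through the public API with the "bulk"
 * configuration and must insert keys in sorted order. The table URI,
 * string formats and key/value contents are assumptions for illustration.
 */
static int
example_bulk_load(WT_SESSION *session)
{
    WT_CURSOR *cursor;
    int ret;

    if ((ret = session->open_cursor(
        session, "table:example", NULL, "bulk", &cursor)) != 0)
        return (ret);
    cursor->set_key(cursor, "key-1");
    cursor->set_value(cursor, "value-1");
    if ((ret = cursor->insert(cursor)) != 0) {
        (void)cursor->close(cursor);
        return (ret);
    }
    return (cursor->close(cursor));
}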